blob: 2dea8a5a170bb5eca6b6dd6bfc9d01f41ef9c703 [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
Guido van Rossumd57fd912000-03-10 22:53:23 +000049/* Limit for the Unicode object free list */
50
Christian Heimes2202f872008-02-06 14:31:34 +000051#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000052
53/* Limit for the Unicode object free list stay alive optimization.
54
55 The implementation will keep allocated Unicode memory intact for
56 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000057 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000058
Christian Heimes2202f872008-02-06 14:31:34 +000059 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000060 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000061 malloc()-overhead) bytes of unused garbage.
62
63 Setting the limit to 0 effectively turns the feature off.
64
Guido van Rossumfd4b9572000-04-10 13:51:10 +000065 Note: This is an experimental feature ! If you get core dumps when
66 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000067
68*/
69
Guido van Rossumfd4b9572000-04-10 13:51:10 +000070#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000071
72/* Endianness switches; defaults to little endian */
73
74#ifdef WORDS_BIGENDIAN
75# define BYTEORDER_IS_BIG_ENDIAN
76#else
77# define BYTEORDER_IS_LITTLE_ENDIAN
78#endif
79
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000080/* --- Globals ------------------------------------------------------------
81
82 The globals are initialized by the _PyUnicode_Init() API and should
83 not be used before calling that API.
84
85*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000086
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000087
88#ifdef __cplusplus
89extern "C" {
90#endif
91
Victor Stinner910337b2011-10-03 03:20:16 +020092#ifdef Py_DEBUG
93# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op)
94#else
95# define _PyUnicode_CHECK(op) PyUnicode_Check(op)
96#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020097
Victor Stinnere90fe6a2011-10-01 16:48:13 +020098#define _PyUnicode_UTF8(op) \
99 (((PyCompactUnicodeObject*)(op))->utf8)
100#define PyUnicode_UTF8(op) \
Victor Stinner910337b2011-10-03 03:20:16 +0200101 (assert(_PyUnicode_CHECK(op)), \
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200102 assert(PyUnicode_IS_READY(op)), \
103 PyUnicode_IS_COMPACT_ASCII(op) ? \
104 ((char*)((PyASCIIObject*)(op) + 1)) : \
105 _PyUnicode_UTF8(op))
Victor Stinnerbc8b81b2011-09-29 19:31:34 +0200106#define _PyUnicode_UTF8_LENGTH(op) \
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200107 (((PyCompactUnicodeObject*)(op))->utf8_length)
108#define PyUnicode_UTF8_LENGTH(op) \
Victor Stinner910337b2011-10-03 03:20:16 +0200109 (assert(_PyUnicode_CHECK(op)), \
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200110 assert(PyUnicode_IS_READY(op)), \
111 PyUnicode_IS_COMPACT_ASCII(op) ? \
112 ((PyASCIIObject*)(op))->length : \
113 _PyUnicode_UTF8_LENGTH(op))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200114#define _PyUnicode_WSTR(op) (((PyASCIIObject*)(op))->wstr)
115#define _PyUnicode_WSTR_LENGTH(op) (((PyCompactUnicodeObject*)(op))->wstr_length)
116#define _PyUnicode_LENGTH(op) (((PyASCIIObject *)(op))->length)
117#define _PyUnicode_STATE(op) (((PyASCIIObject *)(op))->state)
118#define _PyUnicode_HASH(op) (((PyASCIIObject *)(op))->hash)
Victor Stinner910337b2011-10-03 03:20:16 +0200119#define _PyUnicode_KIND(op) \
120 (assert(_PyUnicode_CHECK(op)), \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200121 ((PyASCIIObject *)(op))->state.kind)
Victor Stinner910337b2011-10-03 03:20:16 +0200122#define _PyUnicode_GET_LENGTH(op) \
123 (assert(_PyUnicode_CHECK(op)), \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200124 ((PyASCIIObject *)(op))->length)
Victor Stinnerc3c74152011-10-02 20:39:55 +0200125#define _PyUnicode_DATA_ANY(op) (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200126
Victor Stinner910337b2011-10-03 03:20:16 +0200127#undef PyUnicode_READY
128#define PyUnicode_READY(op) \
129 (assert(_PyUnicode_CHECK(op)), \
130 (PyUnicode_IS_READY(op) ? \
131 0 : _PyUnicode_Ready((PyObject *)(op))))
132
Victor Stinner829c0ad2011-10-03 01:08:02 +0200133/* true if the Unicode object has an allocated UTF-8 memory block
134 (not shared with other data) */
Victor Stinner910337b2011-10-03 03:20:16 +0200135#define _PyUnicode_HAS_UTF8_MEMORY(op) \
136 (assert(_PyUnicode_CHECK(op)), \
137 (!PyUnicode_IS_COMPACT_ASCII(op) \
138 && _PyUnicode_UTF8(op) \
Victor Stinner829c0ad2011-10-03 01:08:02 +0200139 && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
140
Victor Stinner910337b2011-10-03 03:20:16 +0200141/* Generic helper macro to convert characters of different types.
142 from_type and to_type have to be valid type names, begin and end
143 are pointers to the source characters which should be of type
144 "from_type *". to is a pointer of type "to_type *" and points to the
145 buffer where the result characters are written to. */
146#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
147 do { \
148 const from_type *iter_; to_type *to_; \
149 for (iter_ = (begin), to_ = (to_type *)(to); \
150 iter_ < (end); \
151 ++iter_, ++to_) { \
152 *to_ = (to_type)*iter_; \
153 } \
154 } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200155
Victor Stinnerb15d4d82011-09-28 23:59:20 +0200156/* The Unicode string has been modified: reset the hash */
157#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)
158
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
167static PyObject *interned;
168
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000169/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200170static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000171
172/* Single character Unicode strings in the Latin-1 range are being
173 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200174static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000175
Christian Heimes190d79e2008-01-30 11:58:22 +0000176/* Fast detection of the most frequent whitespace characters */
177const unsigned char _Py_ascii_whitespace[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +0000178 0, 0, 0, 0, 0, 0, 0, 0,
Florent Xicluna806d8cf2010-03-30 19:34:18 +0000179/* case 0x0009: * CHARACTER TABULATION */
Christian Heimes1a8501c2008-10-02 19:56:01 +0000180/* case 0x000A: * LINE FEED */
Florent Xicluna806d8cf2010-03-30 19:34:18 +0000181/* case 0x000B: * LINE TABULATION */
Christian Heimes1a8501c2008-10-02 19:56:01 +0000182/* case 0x000C: * FORM FEED */
183/* case 0x000D: * CARRIAGE RETURN */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000184 0, 1, 1, 1, 1, 1, 0, 0,
185 0, 0, 0, 0, 0, 0, 0, 0,
Christian Heimes1a8501c2008-10-02 19:56:01 +0000186/* case 0x001C: * FILE SEPARATOR */
187/* case 0x001D: * GROUP SEPARATOR */
188/* case 0x001E: * RECORD SEPARATOR */
189/* case 0x001F: * UNIT SEPARATOR */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000190 0, 0, 0, 0, 1, 1, 1, 1,
Christian Heimes1a8501c2008-10-02 19:56:01 +0000191/* case 0x0020: * SPACE */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000192 1, 0, 0, 0, 0, 0, 0, 0,
193 0, 0, 0, 0, 0, 0, 0, 0,
194 0, 0, 0, 0, 0, 0, 0, 0,
195 0, 0, 0, 0, 0, 0, 0, 0,
Christian Heimes190d79e2008-01-30 11:58:22 +0000196
Benjamin Peterson14339b62009-01-31 16:36:08 +0000197 0, 0, 0, 0, 0, 0, 0, 0,
198 0, 0, 0, 0, 0, 0, 0, 0,
199 0, 0, 0, 0, 0, 0, 0, 0,
200 0, 0, 0, 0, 0, 0, 0, 0,
201 0, 0, 0, 0, 0, 0, 0, 0,
202 0, 0, 0, 0, 0, 0, 0, 0,
203 0, 0, 0, 0, 0, 0, 0, 0,
204 0, 0, 0, 0, 0, 0, 0, 0
Christian Heimes190d79e2008-01-30 11:58:22 +0000205};
206
Victor Stinnerfe226c02011-10-03 03:52:20 +0200207static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
208
Alexander Belopolsky40018472011-02-26 01:02:56 +0000209static PyObject *
210unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000211 PyObject **errorHandler,const char *encoding, const char *reason,
212 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
213 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
214
Alexander Belopolsky40018472011-02-26 01:02:56 +0000215static void
216raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300217 const char *encoding,
218 const Py_UNICODE *unicode, Py_ssize_t size,
219 Py_ssize_t startpos, Py_ssize_t endpos,
220 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000221
Christian Heimes190d79e2008-01-30 11:58:22 +0000222/* Same for linebreaks */
223static unsigned char ascii_linebreak[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +0000224 0, 0, 0, 0, 0, 0, 0, 0,
Christian Heimes1a8501c2008-10-02 19:56:01 +0000225/* 0x000A, * LINE FEED */
Florent Xicluna806d8cf2010-03-30 19:34:18 +0000226/* 0x000B, * LINE TABULATION */
227/* 0x000C, * FORM FEED */
Christian Heimes1a8501c2008-10-02 19:56:01 +0000228/* 0x000D, * CARRIAGE RETURN */
Florent Xicluna806d8cf2010-03-30 19:34:18 +0000229 0, 0, 1, 1, 1, 1, 0, 0,
Benjamin Peterson14339b62009-01-31 16:36:08 +0000230 0, 0, 0, 0, 0, 0, 0, 0,
Christian Heimes1a8501c2008-10-02 19:56:01 +0000231/* 0x001C, * FILE SEPARATOR */
232/* 0x001D, * GROUP SEPARATOR */
233/* 0x001E, * RECORD SEPARATOR */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000234 0, 0, 0, 0, 1, 1, 1, 0,
235 0, 0, 0, 0, 0, 0, 0, 0,
236 0, 0, 0, 0, 0, 0, 0, 0,
237 0, 0, 0, 0, 0, 0, 0, 0,
238 0, 0, 0, 0, 0, 0, 0, 0,
Christian Heimes190d79e2008-01-30 11:58:22 +0000239
Benjamin Peterson14339b62009-01-31 16:36:08 +0000240 0, 0, 0, 0, 0, 0, 0, 0,
241 0, 0, 0, 0, 0, 0, 0, 0,
242 0, 0, 0, 0, 0, 0, 0, 0,
243 0, 0, 0, 0, 0, 0, 0, 0,
244 0, 0, 0, 0, 0, 0, 0, 0,
245 0, 0, 0, 0, 0, 0, 0, 0,
246 0, 0, 0, 0, 0, 0, 0, 0,
247 0, 0, 0, 0, 0, 0, 0, 0
Christian Heimes190d79e2008-01-30 11:58:22 +0000248};
249
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300250/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
251 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000252Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000253PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000254{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000255#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000256 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000257#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000258 /* This is actually an illegal character, so it should
259 not be passed to unichr. */
260 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000261#endif
262}
263
#ifdef Py_DEBUG
/* Debug-only structural check of a Unicode object.

   Uses assert() to verify that the state bits (kind, compact, ascii,
   ready, interned) and the wstr / data / utf8 pointers agree with each
   other for the representation the object claims to use.  Always returns 1
   so that the call can itself be wrapped in assert(). */
static int
_PyUnicode_CheckConsistency(void *op)
{
    PyASCIIObject *hdr;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    hdr = (PyASCIIObject *)op;
    kind = hdr->state.kind;

    /* ASCII flag set: must be a ready, compact, 1-byte string. */
    if (hdr->state.ascii == 1) {
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(hdr->state.compact == 1);
        assert(hdr->state.ready == 1);
        return 1;
    }

    /* Compact but not ASCII: any canonical kind, always ready. */
    if (hdr->state.compact == 1) {
        assert(kind == PyUnicode_1BYTE_KIND
               || kind == PyUnicode_2BYTE_KIND
               || kind == PyUnicode_4BYTE_KIND);
        assert(hdr->state.compact == 1);
        assert(hdr->state.ascii == 0);
        assert(hdr->state.ready == 1);
        return 1;
    }

    /* Non-compact (legacy) object. */
    {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        PyUnicodeObject *legacy = (PyUnicodeObject *)op;

        if (kind != PyUnicode_WCHAR_KIND) {
            /* Ready legacy string: owns a separate data buffer. */
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(hdr->state.compact == 0);
            assert(hdr->state.ready == 1);
            assert(legacy->data.any != NULL);
            assert(hdr->state.ascii == 0);
            return 1;
        }

        /* Not-yet-ready legacy string: only the wstr buffer exists. */
        assert(hdr->state.compact == 0);
        assert(hdr->state.ascii == 0);
        assert(hdr->state.ready == 0);
        assert(hdr->wstr != NULL);
        assert(legacy->data.any == NULL);
        assert(compact->utf8 == NULL);
        assert(hdr->state.interned == SSTATE_NOT_INTERNED);
    }
    return 1;
}
#endif
314
Thomas Wouters477c8d52006-05-27 19:21:47 +0000315/* --- Bloom Filters ----------------------------------------------------- */
316
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low
   bits of each Unicode character (ch & (BLOOM_WIDTH - 1)) as the
   bit index. */
320
321/* the linebreak mask is set up by Unicode_Init below */
322
Antoine Pitrouf068f942010-01-13 14:19:12 +0000323#if LONG_BIT >= 128
324#define BLOOM_WIDTH 128
325#elif LONG_BIT >= 64
326#define BLOOM_WIDTH 64
327#elif LONG_BIT >= 32
328#define BLOOM_WIDTH 32
329#else
330#error "LONG_BIT is smaller than 32"
331#endif
332
Thomas Wouters477c8d52006-05-27 19:21:47 +0000333#define BLOOM_MASK unsigned long
334
335static BLOOM_MASK bloom_linebreak;
336
Antoine Pitrouf068f942010-01-13 14:19:12 +0000337#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
338#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000339
Benjamin Peterson29060642009-01-31 22:14:21 +0000340#define BLOOM_LINEBREAK(ch) \
341 ((ch) < 128U ? ascii_linebreak[(ch)] : \
342 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000343
Alexander Belopolsky40018472011-02-26 01:02:56 +0000344Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200345make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000346{
347 /* calculate simple bloom-style bitmask for a given unicode string */
348
Antoine Pitrouf068f942010-01-13 14:19:12 +0000349 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000350 Py_ssize_t i;
351
352 mask = 0;
353 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200354 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000355
356 return mask;
357}
358
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200359#define BLOOM_MEMBER(mask, chr, str) \
360 (BLOOM(mask, chr) \
361 && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000362
Guido van Rossumd57fd912000-03-10 22:53:23 +0000363/* --- Unicode Object ----------------------------------------------------- */
364
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200365static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200366fixup(PyUnicodeObject *self, Py_UCS4 (*fixfct)(PyUnicodeObject *s));
367
368Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
369 Py_ssize_t size, Py_UCS4 ch,
370 int direction)
371{
372 /* like wcschr, but doesn't stop at NULL characters */
373 Py_ssize_t i;
374 if (direction == 1) {
375 for(i = 0; i < size; i++)
376 if (PyUnicode_READ(kind, s, i) == ch)
377 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
378 }
379 else {
380 for(i = size-1; i >= 0; i--)
381 if (PyUnicode_READ(kind, s, i) == ch)
382 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
383 }
384 return NULL;
385}
386
Victor Stinnerfe226c02011-10-03 03:52:20 +0200387static PyObject*
388resize_compact(PyObject *unicode, Py_ssize_t length)
389{
390 Py_ssize_t char_size;
391 Py_ssize_t struct_size;
392 Py_ssize_t new_size;
393 int share_wstr;
394
395 assert(PyUnicode_IS_READY(unicode));
396 char_size = PyUnicode_CHARACTER_SIZE(unicode);
397 if (PyUnicode_IS_COMPACT_ASCII(unicode))
398 struct_size = sizeof(PyASCIIObject);
399 else
400 struct_size = sizeof(PyCompactUnicodeObject);
401 share_wstr = (_PyUnicode_WSTR(unicode) == PyUnicode_DATA(unicode));
402
403 _Py_DEC_REFTOTAL;
404 _Py_ForgetReference(unicode);
405
406 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
407 PyErr_NoMemory();
408 return NULL;
409 }
410 new_size = (struct_size + (length + 1) * char_size);
411
412 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
413 if (unicode == NULL) {
414 PyObject_Del(unicode);
415 PyErr_NoMemory();
416 return NULL;
417 }
418 _Py_NewReference(unicode);
419 _PyUnicode_LENGTH(unicode) = length;
420 if (share_wstr)
421 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
422 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
423 length, 0);
424 return unicode;
425}
426
Alexander Belopolsky40018472011-02-26 01:02:56 +0000427static int
Victor Stinnerfe226c02011-10-03 03:52:20 +0200428resize_inplace(register PyUnicodeObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000429{
430 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000431
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200432 assert(!PyUnicode_IS_COMPACT(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200433
Victor Stinnerfe226c02011-10-03 03:52:20 +0200434 assert(Py_REFCNT(unicode) == 1);
Victor Stinnerb15d4d82011-09-28 23:59:20 +0200435 _PyUnicode_DIRTY(unicode);
Tim Petersced69f82003-09-16 20:30:58 +0000436
Victor Stinnerfe226c02011-10-03 03:52:20 +0200437 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
438 {
439 PyObject_DEL(_PyUnicode_UTF8(unicode));
440 _PyUnicode_UTF8(unicode) = NULL;
441 }
442
443 if (PyUnicode_IS_READY(unicode)) {
444 Py_ssize_t char_size;
445 Py_ssize_t new_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200446 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200447 void *data;
448
449 data = _PyUnicode_DATA_ANY(unicode);
450 assert(data != NULL);
451 char_size = PyUnicode_CHARACTER_SIZE(unicode);
452 share_wstr = (_PyUnicode_WSTR(unicode) == data);
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200453 share_utf8 = (_PyUnicode_UTF8(unicode) == data);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200454
455 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
456 PyErr_NoMemory();
457 return -1;
458 }
459 new_size = (length + 1) * char_size;
460
461 data = (PyObject *)PyObject_REALLOC(data, new_size);
462 if (data == NULL) {
463 PyErr_NoMemory();
464 return -1;
465 }
466 _PyUnicode_DATA_ANY(unicode) = data;
467 if (share_wstr)
468 _PyUnicode_WSTR(unicode) = data;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200469 if (share_utf8)
470 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200471 _PyUnicode_LENGTH(unicode) = length;
472 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
473 if (share_wstr)
474 return 0;
475 }
476 if (_PyUnicode_WSTR(unicode) != NULL) {
477 assert(_PyUnicode_WSTR(unicode) != NULL);
478
479 oldstr = _PyUnicode_WSTR(unicode);
480 _PyUnicode_WSTR(unicode) = PyObject_REALLOC(_PyUnicode_WSTR(unicode),
481 sizeof(Py_UNICODE) * (length + 1));
482 if (!_PyUnicode_WSTR(unicode)) {
483 _PyUnicode_WSTR(unicode) = (Py_UNICODE *)oldstr;
484 PyErr_NoMemory();
485 return -1;
486 }
487 _PyUnicode_WSTR(unicode)[length] = 0;
488 _PyUnicode_WSTR_LENGTH(unicode) = length;
489 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000490 return 0;
491}
492
Victor Stinnerfe226c02011-10-03 03:52:20 +0200493static PyObject*
494resize_copy(PyObject *unicode, Py_ssize_t length)
495{
496 Py_ssize_t copy_length;
497 if (PyUnicode_IS_COMPACT(unicode)) {
498 PyObject *copy;
499 assert(PyUnicode_IS_READY(unicode));
500
501 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
502 if (copy == NULL)
503 return NULL;
504
505 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
506 if (PyUnicode_CopyCharacters(copy, 0,
507 unicode, 0,
508 copy_length) < 0)
509 {
510 Py_DECREF(copy);
511 return NULL;
512 }
513 return copy;
514 } else {
Victor Stinner2fd82272011-10-03 04:06:05 +0200515 PyUnicodeObject *w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200516 assert(_PyUnicode_WSTR(unicode) != NULL);
517 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinner2fd82272011-10-03 04:06:05 +0200518 w = _PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200519 if (w == NULL)
520 return NULL;
521 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
522 copy_length = Py_MIN(copy_length, length);
523 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
524 copy_length);
525 return (PyObject*)w;
526 }
527}
528
/* We allocate one more character to make sure the string is
   U+0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
537
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200538#ifdef Py_DEBUG
539int unicode_old_new_calls = 0;
540#endif
541
Alexander Belopolsky40018472011-02-26 01:02:56 +0000542static PyUnicodeObject *
543_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000544{
545 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200546 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000547
Thomas Wouters477c8d52006-05-27 19:21:47 +0000548 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000549 if (length == 0 && unicode_empty != NULL) {
550 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200551 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000552 }
553
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000554 /* Ensure we won't overflow the size. */
555 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
556 return (PyUnicodeObject *)PyErr_NoMemory();
557 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200558 if (length < 0) {
559 PyErr_SetString(PyExc_SystemError,
560 "Negative size passed to _PyUnicode_New");
561 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000562 }
563
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200564#ifdef Py_DEBUG
565 ++unicode_old_new_calls;
566#endif
567
568 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
569 if (unicode == NULL)
570 return NULL;
571 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
572 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
573 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000574 PyErr_NoMemory();
575 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000576 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200577
Jeremy Hyltond8082792003-09-16 19:41:39 +0000578 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000579 * the caller fails before initializing str -- unicode_resize()
580 * reads str[0], and the Keep-Alive optimization can keep memory
581 * allocated for str alive across a call to unicode_dealloc(unicode).
582 * We don't want unicode_resize to read uninitialized memory in
583 * that case.
584 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200585 _PyUnicode_WSTR(unicode)[0] = 0;
586 _PyUnicode_WSTR(unicode)[length] = 0;
587 _PyUnicode_WSTR_LENGTH(unicode) = length;
588 _PyUnicode_HASH(unicode) = -1;
589 _PyUnicode_STATE(unicode).interned = 0;
590 _PyUnicode_STATE(unicode).kind = 0;
591 _PyUnicode_STATE(unicode).compact = 0;
592 _PyUnicode_STATE(unicode).ready = 0;
593 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200594 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200595 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200596 _PyUnicode_UTF8(unicode) = NULL;
597 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000598 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000599
Benjamin Peterson29060642009-01-31 22:14:21 +0000600 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000601 /* XXX UNREF/NEWREF interface should be more symmetrical */
602 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000603 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000604 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000605 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000606}
607
Victor Stinnerf42dc442011-10-02 23:33:16 +0200608static const char*
609unicode_kind_name(PyObject *unicode)
610{
Victor Stinner910337b2011-10-03 03:20:16 +0200611 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerf42dc442011-10-02 23:33:16 +0200612 if (!PyUnicode_IS_COMPACT(unicode))
613 {
614 if (!PyUnicode_IS_READY(unicode))
615 return "wstr";
616 switch(PyUnicode_KIND(unicode))
617 {
618 case PyUnicode_1BYTE_KIND:
619 if (PyUnicode_IS_COMPACT_ASCII(unicode))
620 return "legacy ascii";
621 else
622 return "legacy latin1";
623 case PyUnicode_2BYTE_KIND:
624 return "legacy UCS2";
625 case PyUnicode_4BYTE_KIND:
626 return "legacy UCS4";
627 default:
628 return "<legacy invalid kind>";
629 }
630 }
631 assert(PyUnicode_IS_READY(unicode));
632 switch(PyUnicode_KIND(unicode))
633 {
634 case PyUnicode_1BYTE_KIND:
635 if (PyUnicode_IS_COMPACT_ASCII(unicode))
636 return "ascii";
637 else
638 return "compact latin1";
639 case PyUnicode_2BYTE_KIND:
640 return "compact UCS2";
641 case PyUnicode_4BYTE_KIND:
642 return "compact UCS4";
643 default:
644 return "<invalid compact kind>";
645 }
646}
647
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200648#ifdef Py_DEBUG
649int unicode_new_new_calls = 0;
650
651/* Functions wrapping macros for use in debugger */
652char *_PyUnicode_utf8(void *unicode){
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200653 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200654}
655
656void *_PyUnicode_compact_data(void *unicode) {
657 return _PyUnicode_COMPACT_DATA(unicode);
658}
659void *_PyUnicode_data(void *unicode){
660 printf("obj %p\n", unicode);
661 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
662 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
663 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
664 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
665 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
666 return PyUnicode_DATA(unicode);
667}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200668
669void
670_PyUnicode_Dump(PyObject *op)
671{
672 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200673 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
674 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
675 void *data;
676 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
677 if (ascii->state.compact)
678 data = (compact + 1);
679 else
680 data = unicode->data.any;
681 if (ascii->wstr == data)
682 printf("shared ");
683 printf("wstr=%p", ascii->wstr);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200684 if (!ascii->state.ascii) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200685 printf(" (%zu), ", compact->wstr_length);
686 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
687 printf("shared ");
688 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200689 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200690 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200691}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200692#endif
693
694PyObject *
695PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
696{
697 PyObject *obj;
698 PyCompactUnicodeObject *unicode;
699 void *data;
700 int kind_state;
701 int is_sharing = 0, is_ascii = 0;
702 Py_ssize_t char_size;
703 Py_ssize_t struct_size;
704
705 /* Optimization for empty strings */
706 if (size == 0 && unicode_empty != NULL) {
707 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200708 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200709 }
710
711#ifdef Py_DEBUG
712 ++unicode_new_new_calls;
713#endif
714
715 struct_size = sizeof(PyCompactUnicodeObject);
716 if (maxchar < 128) {
717 kind_state = PyUnicode_1BYTE_KIND;
718 char_size = 1;
719 is_ascii = 1;
720 struct_size = sizeof(PyASCIIObject);
721 }
722 else if (maxchar < 256) {
723 kind_state = PyUnicode_1BYTE_KIND;
724 char_size = 1;
725 }
726 else if (maxchar < 65536) {
727 kind_state = PyUnicode_2BYTE_KIND;
728 char_size = 2;
729 if (sizeof(wchar_t) == 2)
730 is_sharing = 1;
731 }
732 else {
733 kind_state = PyUnicode_4BYTE_KIND;
734 char_size = 4;
735 if (sizeof(wchar_t) == 4)
736 is_sharing = 1;
737 }
738
739 /* Ensure we won't overflow the size. */
740 if (size < 0) {
741 PyErr_SetString(PyExc_SystemError,
742 "Negative size passed to PyUnicode_New");
743 return NULL;
744 }
745 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
746 return PyErr_NoMemory();
747
748 /* Duplicated allocation code from _PyObject_New() instead of a call to
749 * PyObject_New() so we are able to allocate space for the object and
750 * it's data buffer.
751 */
752 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
753 if (obj == NULL)
754 return PyErr_NoMemory();
755 obj = PyObject_INIT(obj, &PyUnicode_Type);
756 if (obj == NULL)
757 return NULL;
758
759 unicode = (PyCompactUnicodeObject *)obj;
760 if (is_ascii)
761 data = ((PyASCIIObject*)obj) + 1;
762 else
763 data = unicode + 1;
764 _PyUnicode_LENGTH(unicode) = size;
765 _PyUnicode_HASH(unicode) = -1;
766 _PyUnicode_STATE(unicode).interned = 0;
767 _PyUnicode_STATE(unicode).kind = kind_state;
768 _PyUnicode_STATE(unicode).compact = 1;
769 _PyUnicode_STATE(unicode).ready = 1;
770 _PyUnicode_STATE(unicode).ascii = is_ascii;
771 if (is_ascii) {
772 ((char*)data)[size] = 0;
773 _PyUnicode_WSTR(unicode) = NULL;
774 }
775 else if (kind_state == PyUnicode_1BYTE_KIND) {
776 ((char*)data)[size] = 0;
777 _PyUnicode_WSTR(unicode) = NULL;
778 _PyUnicode_WSTR_LENGTH(unicode) = 0;
779 unicode->utf8_length = 0;
780 unicode->utf8 = NULL;
781 }
782 else {
783 unicode->utf8 = NULL;
784 if (kind_state == PyUnicode_2BYTE_KIND)
785 ((Py_UCS2*)data)[size] = 0;
786 else /* kind_state == PyUnicode_4BYTE_KIND */
787 ((Py_UCS4*)data)[size] = 0;
788 if (is_sharing) {
789 _PyUnicode_WSTR_LENGTH(unicode) = size;
790 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
791 }
792 else {
793 _PyUnicode_WSTR_LENGTH(unicode) = 0;
794 _PyUnicode_WSTR(unicode) = NULL;
795 }
796 }
797 return obj;
798}
799
#if SIZEOF_WCHAR_T == 2
/* Decode the 16-bit wchar_t range [begin, end) into the UCS4 buffer of
   `unicode`.  A high surrogate immediately followed by a low surrogate is
   combined into a single code point; every other unit, lone surrogates
   included, is copied through unchanged.  The other conversions are
   implemented as macros for efficiency.

   `unicode` must be a 4-byte-kind string whose length equals the number of
   code points produced; both conditions are asserted in debug builds.  No
   terminating null character is written here. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyUnicodeObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        int is_pair;

        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        is_pair = (src[0] >= 0xD800 && src[0] <= 0xDBFF
                   && (src + 1) < end
                   && src[1] >= 0xDC00 && src[1] <= 0xDFFF);
        if (!is_pair) {
            *dst++ = *src++;
            continue;
        }
        /* Combine the two 10-bit halves and add the supplementary offset. */
        *dst++ = (((src[0] & 0x3FF) << 10) | (src[1] & 0x3FF)) + 0x10000;
        src += 2;
    }
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
838
Victor Stinnercd9950f2011-10-02 00:34:53 +0200839static int
840_PyUnicode_Dirty(PyObject *unicode)
841{
Victor Stinner910337b2011-10-03 03:20:16 +0200842 assert(_PyUnicode_CHECK(unicode));
Victor Stinnercd9950f2011-10-02 00:34:53 +0200843 if (Py_REFCNT(unicode) != 1) {
844 PyErr_SetString(PyExc_ValueError,
845 "Cannot modify a string having more than 1 reference");
846 return -1;
847 }
848 _PyUnicode_DIRTY(unicode);
849 return 0;
850}
851
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200852Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200853PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
854 PyObject *from, Py_ssize_t from_start,
855 Py_ssize_t how_many)
856{
Victor Stinnera0702ab2011-09-29 14:14:38 +0200857 unsigned int from_kind, to_kind;
858 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200859
Victor Stinnerb1536152011-09-30 02:26:10 +0200860 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
861 PyErr_BadInternalCall();
862 return -1;
863 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200864
865 if (PyUnicode_READY(from))
866 return -1;
867 if (PyUnicode_READY(to))
868 return -1;
869
Victor Stinnerff9e50f2011-09-28 22:17:19 +0200870 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200871 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
872 PyErr_Format(PyExc_ValueError,
873 "Cannot write %zi characters at %zi "
874 "in a string of %zi characters",
875 how_many, to_start, PyUnicode_GET_LENGTH(to));
876 return -1;
877 }
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200878 if (how_many == 0)
879 return 0;
880
Victor Stinnercd9950f2011-10-02 00:34:53 +0200881 if (_PyUnicode_Dirty(to))
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200882 return -1;
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200883
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200884 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200885 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200886 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200887 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200888
Victor Stinnerf42dc442011-10-02 23:33:16 +0200889 if (from_kind == to_kind
890 /* deny latin1 => ascii */
891 && PyUnicode_MAX_CHAR_VALUE(to) >= PyUnicode_MAX_CHAR_VALUE(from))
892 {
Victor Stinnera0702ab2011-09-29 14:14:38 +0200893 Py_MEMCPY((char*)to_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200894 + PyUnicode_KIND_SIZE(to_kind, to_start),
Victor Stinnera0702ab2011-09-29 14:14:38 +0200895 (char*)from_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200896 + PyUnicode_KIND_SIZE(from_kind, from_start),
897 PyUnicode_KIND_SIZE(to_kind, how_many));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200898 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200899 else if (from_kind == PyUnicode_1BYTE_KIND
900 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200901 {
902 _PyUnicode_CONVERT_BYTES(
903 Py_UCS1, Py_UCS2,
904 PyUnicode_1BYTE_DATA(from) + from_start,
905 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
906 PyUnicode_2BYTE_DATA(to) + to_start
907 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200908 }
Victor Stinner157f83f2011-09-28 21:41:31 +0200909 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200910 && to_kind == PyUnicode_4BYTE_KIND)
911 {
912 _PyUnicode_CONVERT_BYTES(
913 Py_UCS1, Py_UCS4,
914 PyUnicode_1BYTE_DATA(from) + from_start,
915 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
916 PyUnicode_4BYTE_DATA(to) + to_start
917 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200918 }
919 else if (from_kind == PyUnicode_2BYTE_KIND
920 && to_kind == PyUnicode_4BYTE_KIND)
921 {
922 _PyUnicode_CONVERT_BYTES(
923 Py_UCS2, Py_UCS4,
924 PyUnicode_2BYTE_DATA(from) + from_start,
925 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
926 PyUnicode_4BYTE_DATA(to) + to_start
927 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200928 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200929 else {
930 int invalid_kinds;
Victor Stinnerf42dc442011-10-02 23:33:16 +0200931
932 /* check if max_char(from substring) <= max_char(to) */
933 if (from_kind > to_kind
934 /* latin1 => ascii */
935 || (PyUnicode_IS_COMPACT_ASCII(to)
936 && to_kind == PyUnicode_1BYTE_KIND
937 && !PyUnicode_IS_COMPACT_ASCII(from)))
938 {
Victor Stinnera0702ab2011-09-29 14:14:38 +0200939 /* slow path to check for character overflow */
940 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
941 Py_UCS4 ch, maxchar;
942 Py_ssize_t i;
943
944 maxchar = 0;
945 invalid_kinds = 0;
946 for (i=0; i < how_many; i++) {
947 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
948 if (ch > maxchar) {
949 maxchar = ch;
950 if (maxchar > to_maxchar) {
951 invalid_kinds = 1;
952 break;
953 }
954 }
955 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
956 }
957 }
958 else
959 invalid_kinds = 1;
960 if (invalid_kinds) {
961 PyErr_Format(PyExc_ValueError,
Victor Stinnerf42dc442011-10-02 23:33:16 +0200962 "Cannot copy %s characters "
963 "into a string of %s characters",
964 unicode_kind_name(from),
965 unicode_kind_name(to));
Victor Stinnera0702ab2011-09-29 14:14:38 +0200966 return -1;
967 }
968 }
969 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200970}
971
Victor Stinner17222162011-09-28 22:15:37 +0200972/* Find the maximum code point and count the number of surrogate pairs so a
973 correct string length can be computed before converting a string to UCS4.
974 This function counts single surrogates as a character and not as a pair.
975
976 Return 0 on success, or -1 on error. */
977static int
978find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
979 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200980{
981 const wchar_t *iter;
982
Victor Stinnerc53be962011-10-02 21:33:54 +0200983 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200984 if (num_surrogates == NULL || maxchar == NULL) {
985 PyErr_SetString(PyExc_SystemError,
986 "unexpected NULL arguments to "
987 "PyUnicode_FindMaxCharAndNumSurrogatePairs");
988 return -1;
989 }
990
991 *num_surrogates = 0;
992 *maxchar = 0;
993
994 for (iter = begin; iter < end; ) {
995 if (*iter > *maxchar)
996 *maxchar = *iter;
997#if SIZEOF_WCHAR_T == 2
998 if (*iter >= 0xD800 && *iter <= 0xDBFF
999 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
1000 {
1001 Py_UCS4 surrogate_val;
1002 surrogate_val = (((iter[0] & 0x3FF)<<10)
1003 | (iter[1] & 0x3FF)) + 0x10000;
1004 ++(*num_surrogates);
1005 if (surrogate_val > *maxchar)
1006 *maxchar = surrogate_val;
1007 iter += 2;
1008 }
1009 else
1010 iter++;
1011#else
1012 iter++;
1013#endif
1014 }
1015 return 0;
1016}
1017
1018#ifdef Py_DEBUG
1019int unicode_ready_calls = 0;
1020#endif
1021
1022int
Victor Stinnerd8f65102011-09-29 19:43:17 +02001023_PyUnicode_Ready(PyObject *obj)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001024{
Victor Stinnerd8f65102011-09-29 19:43:17 +02001025 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001026 wchar_t *end;
1027 Py_UCS4 maxchar = 0;
1028 Py_ssize_t num_surrogates;
1029#if SIZEOF_WCHAR_T == 2
1030 Py_ssize_t length_wo_surrogates;
1031#endif
1032
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001033 /* _PyUnicode_Ready() is only intented for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001034 strings were created using _PyObject_New() and where no canonical
1035 representation (the str field) has been set yet aka strings
1036 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001037 assert(_PyUnicode_CHECK(unicode));
1038 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001039 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001040 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001041 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001042 /* Actually, it should neither be interned nor be anything else: */
1043 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001044
1045#ifdef Py_DEBUG
1046 ++unicode_ready_calls;
1047#endif
1048
1049 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001050 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001051 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001052 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001053
1054 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001055 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1056 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001057 PyErr_NoMemory();
1058 return -1;
1059 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001060 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001061 _PyUnicode_WSTR(unicode), end,
1062 PyUnicode_1BYTE_DATA(unicode));
1063 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1064 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1065 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1066 if (maxchar < 128) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001067 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001068 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001069 }
1070 else {
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001071 _PyUnicode_UTF8(unicode) = NULL;
1072 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001073 }
1074 PyObject_FREE(_PyUnicode_WSTR(unicode));
1075 _PyUnicode_WSTR(unicode) = NULL;
1076 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1077 }
1078 /* In this case we might have to convert down from 4-byte native
1079 wchar_t to 2-byte unicode. */
1080 else if (maxchar < 65536) {
1081 assert(num_surrogates == 0 &&
1082 "FindMaxCharAndNumSurrogatePairs() messed up");
1083
Victor Stinner506f5922011-09-28 22:34:18 +02001084#if SIZEOF_WCHAR_T == 2
1085 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001086 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001087 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1088 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1089 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001090 _PyUnicode_UTF8(unicode) = NULL;
1091 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001092#else
1093 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001094 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001095 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001096 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001097 PyErr_NoMemory();
1098 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001099 }
Victor Stinner506f5922011-09-28 22:34:18 +02001100 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1101 _PyUnicode_WSTR(unicode), end,
1102 PyUnicode_2BYTE_DATA(unicode));
1103 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1104 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1105 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001106 _PyUnicode_UTF8(unicode) = NULL;
1107 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001108 PyObject_FREE(_PyUnicode_WSTR(unicode));
1109 _PyUnicode_WSTR(unicode) = NULL;
1110 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1111#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001112 }
1113 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1114 else {
1115#if SIZEOF_WCHAR_T == 2
1116 /* in case the native representation is 2-bytes, we need to allocate a
1117 new normalized 4-byte version. */
1118 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001119 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1120 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001121 PyErr_NoMemory();
1122 return -1;
1123 }
1124 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1125 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001126 _PyUnicode_UTF8(unicode) = NULL;
1127 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001128 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1129 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001130 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001131 PyObject_FREE(_PyUnicode_WSTR(unicode));
1132 _PyUnicode_WSTR(unicode) = NULL;
1133 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1134#else
1135 assert(num_surrogates == 0);
1136
Victor Stinnerc3c74152011-10-02 20:39:55 +02001137 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001138 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001139 _PyUnicode_UTF8(unicode) = NULL;
1140 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001141 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1142#endif
1143 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1144 }
1145 _PyUnicode_STATE(unicode).ready = 1;
1146 return 0;
1147}
1148
Alexander Belopolsky40018472011-02-26 01:02:56 +00001149static void
1150unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001151{
Walter Dörwald16807132007-05-25 13:52:07 +00001152 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001153 case SSTATE_NOT_INTERNED:
1154 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001155
Benjamin Peterson29060642009-01-31 22:14:21 +00001156 case SSTATE_INTERNED_MORTAL:
1157 /* revive dead object temporarily for DelItem */
1158 Py_REFCNT(unicode) = 3;
1159 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
1160 Py_FatalError(
1161 "deletion of interned string failed");
1162 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001163
Benjamin Peterson29060642009-01-31 22:14:21 +00001164 case SSTATE_INTERNED_IMMORTAL:
1165 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001166
Benjamin Peterson29060642009-01-31 22:14:21 +00001167 default:
1168 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001169 }
1170
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001171 if (_PyUnicode_WSTR(unicode) &&
1172 (!PyUnicode_IS_READY(unicode) ||
1173 _PyUnicode_WSTR(unicode) != PyUnicode_DATA(unicode)))
1174 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001175 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001176 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001177
1178 if (PyUnicode_IS_COMPACT(unicode)) {
1179 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001180 }
1181 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001182 if (_PyUnicode_DATA_ANY(unicode))
1183 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Benjamin Peterson29060642009-01-31 22:14:21 +00001184 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001185 }
1186}
1187
Alexander Belopolsky40018472011-02-26 01:02:56 +00001188static int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001189unicode_resizable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001190{
Victor Stinnera3be6132011-10-03 02:16:37 +02001191 Py_ssize_t len;
Victor Stinnerca4f7a42011-10-03 04:18:04 +02001192#if SIZEOF_WCHAR_T == 2
1193 /* FIXME: unicode_resize() is buggy on Windows */
1194 return 0;
1195#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001196 if (Py_REFCNT(unicode) != 1)
1197 return 0;
1198 if (PyUnicode_CHECK_INTERNED(unicode))
1199 return 0;
1200 if (unicode == unicode_empty)
1201 return 0;
Victor Stinnera3be6132011-10-03 02:16:37 +02001202 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
1203 len = PyUnicode_WSTR_LENGTH(unicode);
1204 else
1205 len = PyUnicode_GET_LENGTH(unicode);
1206 if (len == 1) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001207 Py_UCS4 ch;
Victor Stinnera3be6132011-10-03 02:16:37 +02001208 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001209 ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnera3be6132011-10-03 02:16:37 +02001210 else
1211 ch = PyUnicode_READ_CHAR(unicode, 0);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001212 if (ch < 256 && unicode_latin1[ch] == unicode)
1213 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001214 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001215 return 1;
1216}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001217
Victor Stinnerfe226c02011-10-03 03:52:20 +02001218static int
1219unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1220{
1221 PyObject *unicode;
1222 Py_ssize_t old_length;
1223
1224 assert(p_unicode != NULL);
1225 unicode = *p_unicode;
1226
1227 assert(unicode != NULL);
1228 assert(PyUnicode_Check(unicode));
1229 assert(0 <= length);
1230
Victor Stinner910337b2011-10-03 03:20:16 +02001231 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001232 old_length = PyUnicode_WSTR_LENGTH(unicode);
1233 else
1234 old_length = PyUnicode_GET_LENGTH(unicode);
1235 if (old_length == length)
1236 return 0;
1237
1238 /* FIXME: really create a new object? */
1239 if (!unicode_resizable(unicode)) {
1240 PyObject *copy = resize_copy(unicode, length);
1241 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001242 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001243 Py_DECREF(*p_unicode);
1244 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001245 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001246 }
1247
Victor Stinnerfe226c02011-10-03 03:52:20 +02001248 if (PyUnicode_IS_COMPACT(unicode)) {
1249 *p_unicode = resize_compact(unicode, length);
1250 if (*p_unicode == NULL)
1251 return -1;
1252 return 0;
1253 } else
1254 return resize_inplace((PyUnicodeObject*)unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001255}
1256
Alexander Belopolsky40018472011-02-26 01:02:56 +00001257int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001258PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001259{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001260 PyObject *unicode;
1261 if (p_unicode == NULL) {
1262 PyErr_BadInternalCall();
1263 return -1;
1264 }
1265 unicode = *p_unicode;
1266 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0
1267 || _PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND)
1268 {
1269 PyErr_BadInternalCall();
1270 return -1;
1271 }
1272 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001273}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001274
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001275static PyObject*
1276get_latin1_char(unsigned char ch)
1277{
Victor Stinnera464fc12011-10-02 20:39:30 +02001278 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001279 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001280 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001281 if (!unicode)
1282 return NULL;
1283 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1284 unicode_latin1[ch] = unicode;
1285 }
1286 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001287 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001288}
1289
Alexander Belopolsky40018472011-02-26 01:02:56 +00001290PyObject *
1291PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001292{
1293 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001294 Py_UCS4 maxchar = 0;
1295 Py_ssize_t num_surrogates;
1296
1297 if (u == NULL)
1298 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001299
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001300 /* If the Unicode data is known at construction time, we can apply
1301 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001302
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001303 /* Optimization for empty strings */
1304 if (size == 0 && unicode_empty != NULL) {
1305 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001306 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001307 }
Tim Petersced69f82003-09-16 20:30:58 +00001308
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001309 /* Single character Unicode objects in the Latin-1 range are
1310 shared when using this constructor */
1311 if (size == 1 && *u < 256)
1312 return get_latin1_char((unsigned char)*u);
1313
1314 /* If not empty and not single character, copy the Unicode data
1315 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001316 if (find_maxchar_surrogates(u, u + size,
1317 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001318 return NULL;
1319
1320 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1321 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001322 if (!unicode)
1323 return NULL;
1324
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001325 switch (PyUnicode_KIND(unicode)) {
1326 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001327 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001328 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1329 break;
1330 case PyUnicode_2BYTE_KIND:
1331#if Py_UNICODE_SIZE == 2
1332 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1333#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001334 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001335 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1336#endif
1337 break;
1338 case PyUnicode_4BYTE_KIND:
1339#if SIZEOF_WCHAR_T == 2
1340 /* This is the only case which has to process surrogates, thus
1341 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001342 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001343#else
1344 assert(num_surrogates == 0);
1345 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1346#endif
1347 break;
1348 default:
1349 assert(0 && "Impossible state");
1350 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001351
1352 return (PyObject *)unicode;
1353}
1354
Alexander Belopolsky40018472011-02-26 01:02:56 +00001355PyObject *
1356PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001357{
1358 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001359
Benjamin Peterson14339b62009-01-31 16:36:08 +00001360 if (size < 0) {
1361 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001362 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001363 return NULL;
1364 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001365
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001366 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001367 some optimizations which share commonly used objects.
1368 Also, this means the input must be UTF-8, so fall back to the
1369 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001370 if (u != NULL) {
1371
Benjamin Peterson29060642009-01-31 22:14:21 +00001372 /* Optimization for empty strings */
1373 if (size == 0 && unicode_empty != NULL) {
1374 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001375 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001376 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001377
1378 /* Single characters are shared when using this constructor.
1379 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001380 if (size == 1 && Py_CHARMASK(*u) < 128)
1381 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001382
1383 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001384 }
1385
Walter Dörwald55507312007-05-18 13:12:10 +00001386 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001387 if (!unicode)
1388 return NULL;
1389
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001390 return (PyObject *)unicode;
1391}
1392
Alexander Belopolsky40018472011-02-26 01:02:56 +00001393PyObject *
1394PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001395{
1396 size_t size = strlen(u);
1397 if (size > PY_SSIZE_T_MAX) {
1398 PyErr_SetString(PyExc_OverflowError, "input too long");
1399 return NULL;
1400 }
1401
1402 return PyUnicode_FromStringAndSize(u, size);
1403}
1404
Victor Stinnere57b1c02011-09-28 22:20:48 +02001405static PyObject*
1406_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001407{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001408 PyObject *res;
1409 unsigned char max = 127;
1410 Py_ssize_t i;
1411 for (i = 0; i < size; i++) {
1412 if (u[i] & 0x80) {
1413 max = 255;
1414 break;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001415 }
1416 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001417 res = PyUnicode_New(size, max);
1418 if (!res)
1419 return NULL;
1420 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1421 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001422}
1423
Victor Stinnere57b1c02011-09-28 22:20:48 +02001424static PyObject*
1425_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001426{
1427 PyObject *res;
1428 Py_UCS2 max = 0;
1429 Py_ssize_t i;
1430 for (i = 0; i < size; i++)
1431 if (u[i] > max)
1432 max = u[i];
1433 res = PyUnicode_New(size, max);
1434 if (!res)
1435 return NULL;
1436 if (max >= 256)
1437 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1438 else
1439 for (i = 0; i < size; i++)
1440 PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i];
1441 return res;
1442}
1443
Victor Stinnere57b1c02011-09-28 22:20:48 +02001444static PyObject*
1445_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001446{
1447 PyObject *res;
1448 Py_UCS4 max = 0;
1449 Py_ssize_t i;
1450 for (i = 0; i < size; i++)
1451 if (u[i] > max)
1452 max = u[i];
1453 res = PyUnicode_New(size, max);
1454 if (!res)
1455 return NULL;
1456 if (max >= 0x10000)
1457 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1458 else {
1459 int kind = PyUnicode_KIND(res);
1460 void *data = PyUnicode_DATA(res);
1461 for (i = 0; i < size; i++)
1462 PyUnicode_WRITE(kind, data, i, u[i]);
1463 }
1464 return res;
1465}
1466
1467PyObject*
1468PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1469{
1470 switch(kind) {
1471 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001472 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001473 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001474 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001475 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001476 return _PyUnicode_FromUCS4(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001477 }
Victor Stinner202b62b2011-10-01 23:48:37 +02001478 PyErr_SetString(PyExc_ValueError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001479 return NULL;
1480}
1481
Victor Stinner034f6cf2011-09-30 02:26:44 +02001482PyObject*
1483PyUnicode_Copy(PyObject *unicode)
1484{
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001485 Py_ssize_t size;
1486 PyObject *copy;
1487 void *data;
1488
Victor Stinner034f6cf2011-09-30 02:26:44 +02001489 if (!PyUnicode_Check(unicode)) {
1490 PyErr_BadInternalCall();
1491 return NULL;
1492 }
1493 if (PyUnicode_READY(unicode))
1494 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001495
1496 size = PyUnicode_GET_LENGTH(unicode);
1497 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1498 if (!copy)
1499 return NULL;
1500 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1501
1502 data = PyUnicode_DATA(unicode);
1503 switch (PyUnicode_KIND(unicode))
1504 {
1505 case PyUnicode_1BYTE_KIND:
1506 memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1507 break;
1508 case PyUnicode_2BYTE_KIND:
1509 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1510 break;
1511 case PyUnicode_4BYTE_KIND:
1512 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1513 break;
1514 default:
1515 assert(0);
1516 break;
1517 }
1518 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001519}
1520
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001521
Victor Stinnerbc603d12011-10-02 01:00:40 +02001522/* Widen Unicode objects to larger buffers. Don't write terminating null
1523 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001524
1525void*
1526_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1527{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001528 Py_ssize_t len;
1529 void *result;
1530 unsigned int skind;
1531
1532 if (PyUnicode_READY(s))
1533 return NULL;
1534
1535 len = PyUnicode_GET_LENGTH(s);
1536 skind = PyUnicode_KIND(s);
1537 if (skind >= kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001538 PyErr_SetString(PyExc_RuntimeError, "invalid widening attempt");
1539 return NULL;
1540 }
1541 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001542 case PyUnicode_2BYTE_KIND:
1543 result = PyMem_Malloc(len * sizeof(Py_UCS2));
1544 if (!result)
1545 return PyErr_NoMemory();
1546 assert(skind == PyUnicode_1BYTE_KIND);
1547 _PyUnicode_CONVERT_BYTES(
1548 Py_UCS1, Py_UCS2,
1549 PyUnicode_1BYTE_DATA(s),
1550 PyUnicode_1BYTE_DATA(s) + len,
1551 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001552 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001553 case PyUnicode_4BYTE_KIND:
1554 result = PyMem_Malloc(len * sizeof(Py_UCS4));
1555 if (!result)
1556 return PyErr_NoMemory();
1557 if (skind == PyUnicode_2BYTE_KIND) {
1558 _PyUnicode_CONVERT_BYTES(
1559 Py_UCS2, Py_UCS4,
1560 PyUnicode_2BYTE_DATA(s),
1561 PyUnicode_2BYTE_DATA(s) + len,
1562 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001563 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001564 else {
1565 assert(skind == PyUnicode_1BYTE_KIND);
1566 _PyUnicode_CONVERT_BYTES(
1567 Py_UCS1, Py_UCS4,
1568 PyUnicode_1BYTE_DATA(s),
1569 PyUnicode_1BYTE_DATA(s) + len,
1570 result);
1571 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001572 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001573 default:
1574 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001575 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001576 PyErr_SetString(PyExc_ValueError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001577 return NULL;
1578}
1579
1580static Py_UCS4*
1581as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1582 int copy_null)
1583{
1584 int kind;
1585 void *data;
1586 Py_ssize_t len, targetlen;
1587 if (PyUnicode_READY(string) == -1)
1588 return NULL;
1589 kind = PyUnicode_KIND(string);
1590 data = PyUnicode_DATA(string);
1591 len = PyUnicode_GET_LENGTH(string);
1592 targetlen = len;
1593 if (copy_null)
1594 targetlen++;
1595 if (!target) {
1596 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1597 PyErr_NoMemory();
1598 return NULL;
1599 }
1600 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1601 if (!target) {
1602 PyErr_NoMemory();
1603 return NULL;
1604 }
1605 }
1606 else {
1607 if (targetsize < targetlen) {
1608 PyErr_Format(PyExc_SystemError,
1609 "string is longer than the buffer");
1610 if (copy_null && 0 < targetsize)
1611 target[0] = 0;
1612 return NULL;
1613 }
1614 }
1615 if (kind != PyUnicode_4BYTE_KIND) {
1616 Py_ssize_t i;
1617 for (i = 0; i < len; i++)
1618 target[i] = PyUnicode_READ(kind, data, i);
1619 }
1620 else
1621 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1622 if (copy_null)
1623 target[len] = 0;
1624 return target;
1625}
1626
1627Py_UCS4*
1628PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1629 int copy_null)
1630{
1631 if (target == NULL || targetsize < 1) {
1632 PyErr_BadInternalCall();
1633 return NULL;
1634 }
1635 return as_ucs4(string, target, targetsize, copy_null);
1636}
1637
1638Py_UCS4*
1639PyUnicode_AsUCS4Copy(PyObject *string)
1640{
1641 return as_ucs4(string, NULL, 0, 1);
1642}
1643
1644#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00001645
Alexander Belopolsky40018472011-02-26 01:02:56 +00001646PyObject *
1647PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001648{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001649 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +00001650 if (size == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001651 return PyUnicode_New(0, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00001652 PyErr_BadInternalCall();
1653 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001654 }
1655
Martin v. Löwis790465f2008-04-05 20:41:37 +00001656 if (size == -1) {
1657 size = wcslen(w);
1658 }
1659
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001660 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001661}
1662
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001663#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00001664
Walter Dörwald346737f2007-05-31 10:44:43 +00001665static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001666makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
1667 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00001668{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001669 *fmt++ = '%';
1670 if (width) {
1671 if (zeropad)
1672 *fmt++ = '0';
1673 fmt += sprintf(fmt, "%d", width);
1674 }
1675 if (precision)
1676 fmt += sprintf(fmt, ".%d", precision);
1677 if (longflag)
1678 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001679 else if (longlongflag) {
1680 /* longlongflag should only ever be nonzero on machines with
1681 HAVE_LONG_LONG defined */
1682#ifdef HAVE_LONG_LONG
1683 char *f = PY_FORMAT_LONG_LONG;
1684 while (*f)
1685 *fmt++ = *f++;
1686#else
1687 /* we shouldn't ever get here */
1688 assert(0);
1689 *fmt++ = 'l';
1690#endif
1691 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001692 else if (size_tflag) {
1693 char *f = PY_FORMAT_SIZE_T;
1694 while (*f)
1695 *fmt++ = *f++;
1696 }
1697 *fmt++ = c;
1698 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00001699}
1700
Victor Stinner96865452011-03-01 23:44:09 +00001701/* helper for PyUnicode_FromFormatV() */
1702
1703static const char*
1704parse_format_flags(const char *f,
1705 int *p_width, int *p_precision,
1706 int *p_longflag, int *p_longlongflag, int *p_size_tflag)
1707{
1708 int width, precision, longflag, longlongflag, size_tflag;
1709
1710 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
1711 f++;
1712 width = 0;
1713 while (Py_ISDIGIT((unsigned)*f))
1714 width = (width*10) + *f++ - '0';
1715 precision = 0;
1716 if (*f == '.') {
1717 f++;
1718 while (Py_ISDIGIT((unsigned)*f))
1719 precision = (precision*10) + *f++ - '0';
1720 if (*f == '%') {
1721 /* "%.3%s" => f points to "3" */
1722 f--;
1723 }
1724 }
1725 if (*f == '\0') {
1726 /* bogus format "%.1" => go backward, f points to "1" */
1727 f--;
1728 }
1729 if (p_width != NULL)
1730 *p_width = width;
1731 if (p_precision != NULL)
1732 *p_precision = precision;
1733
1734 /* Handle %ld, %lu, %lld and %llu. */
1735 longflag = 0;
1736 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00001737 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00001738
1739 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00001740 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00001741 longflag = 1;
1742 ++f;
1743 }
1744#ifdef HAVE_LONG_LONG
1745 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00001746 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00001747 longlongflag = 1;
1748 f += 2;
1749 }
1750#endif
1751 }
1752 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00001753 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00001754 size_tflag = 1;
1755 ++f;
1756 }
1757 if (p_longflag != NULL)
1758 *p_longflag = longflag;
1759 if (p_longlongflag != NULL)
1760 *p_longlongflag = longlongflag;
1761 if (p_size_tflag != NULL)
1762 *p_size_tflag = size_tflag;
1763 return f;
1764}
1765
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001766/* maximum number of characters required for output of %ld. 21 characters
1767 allows for 64-bit integers (in decimal) and an optional sign. */
1768#define MAX_LONG_CHARS 21
1769/* maximum number of characters required for output of %lld.
1770 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
1771 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
1772#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
1773
Walter Dörwaldd2034312007-05-18 16:29:38 +00001774PyObject *
1775PyUnicode_FromFormatV(const char *format, va_list vargs)
1776{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001777 va_list count;
1778 Py_ssize_t callcount = 0;
1779 PyObject **callresults = NULL;
1780 PyObject **callresult = NULL;
1781 Py_ssize_t n = 0;
1782 int width = 0;
1783 int precision = 0;
1784 int zeropad;
1785 const char* f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001786 PyUnicodeObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001787 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001788 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001789 Py_UCS4 maxchar = 127; /* result is ASCII by default */
1790 Py_UCS4 argmaxchar;
1791 Py_ssize_t numbersize = 0;
1792 char *numberresults = NULL;
1793 char *numberresult = NULL;
1794 Py_ssize_t i;
1795 int kind;
1796 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001797
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001798 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001799 /* step 1: count the number of %S/%R/%A/%s format specifications
1800 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
1801 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001802 * result in an array)
1803 * also esimate a upper bound for all the number formats in the string,
1804 * numbers will be formated in step 3 and be keept in a '\0'-separated
1805 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00001806 for (f = format; *f; f++) {
1807 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001808 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001809 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
1810 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
1811 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
1812 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001813
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001814 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001815#ifdef HAVE_LONG_LONG
1816 if (longlongflag) {
1817 if (width < MAX_LONG_LONG_CHARS)
1818 width = MAX_LONG_LONG_CHARS;
1819 }
1820 else
1821#endif
1822 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
1823 including sign. Decimal takes the most space. This
1824 isn't enough for octal. If a width is specified we
1825 need more (which we allocate later). */
1826 if (width < MAX_LONG_CHARS)
1827 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001828
1829 /* account for the size + '\0' to separate numbers
1830 inside of the numberresults buffer */
1831 numbersize += (width + 1);
1832 }
1833 }
1834 else if ((unsigned char)*f > 127) {
1835 PyErr_Format(PyExc_ValueError,
1836 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
1837 "string, got a non-ASCII byte: 0x%02x",
1838 (unsigned char)*f);
1839 return NULL;
1840 }
1841 }
1842 /* step 2: allocate memory for the results of
1843 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
1844 if (callcount) {
1845 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
1846 if (!callresults) {
1847 PyErr_NoMemory();
1848 return NULL;
1849 }
1850 callresult = callresults;
1851 }
1852 /* step 2.5: allocate memory for the results of formating numbers */
1853 if (numbersize) {
1854 numberresults = PyObject_Malloc(numbersize);
1855 if (!numberresults) {
1856 PyErr_NoMemory();
1857 goto fail;
1858 }
1859 numberresult = numberresults;
1860 }
1861
1862 /* step 3: format numbers and figure out how large a buffer we need */
1863 for (f = format; *f; f++) {
1864 if (*f == '%') {
1865 const char* p;
1866 int longflag;
1867 int longlongflag;
1868 int size_tflag;
1869 int numprinted;
1870
1871 p = f;
1872 zeropad = (f[1] == '0');
1873 f = parse_format_flags(f, &width, &precision,
1874 &longflag, &longlongflag, &size_tflag);
1875 switch (*f) {
1876 case 'c':
1877 {
1878 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001879 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001880 n++;
1881 break;
1882 }
1883 case '%':
1884 n++;
1885 break;
1886 case 'i':
1887 case 'd':
1888 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1889 width, precision, *f);
1890 if (longflag)
1891 numprinted = sprintf(numberresult, fmt,
1892 va_arg(count, long));
1893#ifdef HAVE_LONG_LONG
1894 else if (longlongflag)
1895 numprinted = sprintf(numberresult, fmt,
1896 va_arg(count, PY_LONG_LONG));
1897#endif
1898 else if (size_tflag)
1899 numprinted = sprintf(numberresult, fmt,
1900 va_arg(count, Py_ssize_t));
1901 else
1902 numprinted = sprintf(numberresult, fmt,
1903 va_arg(count, int));
1904 n += numprinted;
1905 /* advance by +1 to skip over the '\0' */
1906 numberresult += (numprinted + 1);
1907 assert(*(numberresult - 1) == '\0');
1908 assert(*(numberresult - 2) != '\0');
1909 assert(numprinted >= 0);
1910 assert(numberresult <= numberresults + numbersize);
1911 break;
1912 case 'u':
1913 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1914 width, precision, 'u');
1915 if (longflag)
1916 numprinted = sprintf(numberresult, fmt,
1917 va_arg(count, unsigned long));
1918#ifdef HAVE_LONG_LONG
1919 else if (longlongflag)
1920 numprinted = sprintf(numberresult, fmt,
1921 va_arg(count, unsigned PY_LONG_LONG));
1922#endif
1923 else if (size_tflag)
1924 numprinted = sprintf(numberresult, fmt,
1925 va_arg(count, size_t));
1926 else
1927 numprinted = sprintf(numberresult, fmt,
1928 va_arg(count, unsigned int));
1929 n += numprinted;
1930 numberresult += (numprinted + 1);
1931 assert(*(numberresult - 1) == '\0');
1932 assert(*(numberresult - 2) != '\0');
1933 assert(numprinted >= 0);
1934 assert(numberresult <= numberresults + numbersize);
1935 break;
1936 case 'x':
1937 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
1938 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
1939 n += numprinted;
1940 numberresult += (numprinted + 1);
1941 assert(*(numberresult - 1) == '\0');
1942 assert(*(numberresult - 2) != '\0');
1943 assert(numprinted >= 0);
1944 assert(numberresult <= numberresults + numbersize);
1945 break;
1946 case 'p':
1947 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
1948 /* %p is ill-defined: ensure leading 0x. */
1949 if (numberresult[1] == 'X')
1950 numberresult[1] = 'x';
1951 else if (numberresult[1] != 'x') {
1952 memmove(numberresult + 2, numberresult,
1953 strlen(numberresult) + 1);
1954 numberresult[0] = '0';
1955 numberresult[1] = 'x';
1956 numprinted += 2;
1957 }
1958 n += numprinted;
1959 numberresult += (numprinted + 1);
1960 assert(*(numberresult - 1) == '\0');
1961 assert(*(numberresult - 2) != '\0');
1962 assert(numprinted >= 0);
1963 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001964 break;
1965 case 's':
1966 {
1967 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00001968 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001969 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
1970 if (!str)
1971 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001972 /* since PyUnicode_DecodeUTF8 returns already flexible
1973 unicode objects, there is no need to call ready on them */
1974 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001975 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001976 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001977 /* Remember the str and switch to the next slot */
1978 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001979 break;
1980 }
1981 case 'U':
1982 {
1983 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02001984 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001985 if (PyUnicode_READY(obj) == -1)
1986 goto fail;
1987 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001988 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001989 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001990 break;
1991 }
1992 case 'V':
1993 {
1994 PyObject *obj = va_arg(count, PyObject *);
1995 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001996 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001997 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02001998 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00001999 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002000 if (PyUnicode_READY(obj) == -1)
2001 goto fail;
2002 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002003 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002004 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002005 *callresult++ = NULL;
2006 }
2007 else {
2008 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2009 if (!str_obj)
2010 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002011 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002012 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002013 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002014 *callresult++ = str_obj;
2015 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002016 break;
2017 }
2018 case 'S':
2019 {
2020 PyObject *obj = va_arg(count, PyObject *);
2021 PyObject *str;
2022 assert(obj);
2023 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002024 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002025 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002026 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002027 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002028 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002029 /* Remember the str and switch to the next slot */
2030 *callresult++ = str;
2031 break;
2032 }
2033 case 'R':
2034 {
2035 PyObject *obj = va_arg(count, PyObject *);
2036 PyObject *repr;
2037 assert(obj);
2038 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002039 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002040 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002041 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002042 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002043 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002044 /* Remember the repr and switch to the next slot */
2045 *callresult++ = repr;
2046 break;
2047 }
2048 case 'A':
2049 {
2050 PyObject *obj = va_arg(count, PyObject *);
2051 PyObject *ascii;
2052 assert(obj);
2053 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002054 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002055 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002056 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002057 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002058 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002059 /* Remember the repr and switch to the next slot */
2060 *callresult++ = ascii;
2061 break;
2062 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002063 default:
2064 /* if we stumble upon an unknown
2065 formatting code, copy the rest of
2066 the format string to the output
2067 string. (we cannot just skip the
2068 code, since there's no way to know
2069 what's in the argument list) */
2070 n += strlen(p);
2071 goto expand;
2072 }
2073 } else
2074 n++;
2075 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002076 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002077 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002078 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002079 we don't have to resize the string.
2080 There can be no errors beyond this point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002081 string = (PyUnicodeObject *)PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002082 if (!string)
2083 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002084 kind = PyUnicode_KIND(string);
2085 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002086 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002087 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002088
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002089 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002090 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002091 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002092
2093 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002094 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2095 /* checking for == because the last argument could be a empty
2096 string, which causes i to point to end, the assert at the end of
2097 the loop */
2098 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002099
Benjamin Peterson14339b62009-01-31 16:36:08 +00002100 switch (*f) {
2101 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002102 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002103 const int ordinal = va_arg(vargs, int);
2104 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002105 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002106 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002107 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002108 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002109 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002110 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002111 case 'p':
2112 /* unused, since we already have the result */
2113 if (*f == 'p')
2114 (void) va_arg(vargs, void *);
2115 else
2116 (void) va_arg(vargs, int);
2117 /* extract the result from numberresults and append. */
2118 for (; *numberresult; ++i, ++numberresult)
2119 PyUnicode_WRITE(kind, data, i, *numberresult);
2120 /* skip over the separating '\0' */
2121 assert(*numberresult == '\0');
2122 numberresult++;
2123 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002124 break;
2125 case 's':
2126 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002127 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002128 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002129 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002130 size = PyUnicode_GET_LENGTH(*callresult);
2131 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002132 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2133 *callresult, 0,
2134 size) < 0)
2135 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002136 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002137 /* We're done with the unicode()/repr() => forget it */
2138 Py_DECREF(*callresult);
2139 /* switch to next unicode()/repr() result */
2140 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002141 break;
2142 }
2143 case 'U':
2144 {
2145 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002146 Py_ssize_t size;
2147 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2148 size = PyUnicode_GET_LENGTH(obj);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002149 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2150 obj, 0,
2151 size) < 0)
2152 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002153 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002154 break;
2155 }
2156 case 'V':
2157 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002158 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002159 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002160 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002161 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002162 size = PyUnicode_GET_LENGTH(obj);
2163 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002164 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2165 obj, 0,
2166 size) < 0)
2167 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002168 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002169 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002170 size = PyUnicode_GET_LENGTH(*callresult);
2171 assert(PyUnicode_KIND(*callresult) <=
2172 PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002173 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2174 *callresult,
2175 0, size) < 0)
2176 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002177 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002178 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002179 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002180 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002181 break;
2182 }
2183 case 'S':
2184 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002185 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002186 {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002187 /* unused, since we already have the result */
2188 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002189 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002190 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2191 *callresult, 0,
2192 PyUnicode_GET_LENGTH(*callresult)) < 0)
2193 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002194 i += PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002195 /* We're done with the unicode()/repr() => forget it */
2196 Py_DECREF(*callresult);
2197 /* switch to next unicode()/repr() result */
2198 ++callresult;
2199 break;
2200 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002201 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002202 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002203 break;
2204 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002205 for (; *p; ++p, ++i)
2206 PyUnicode_WRITE(kind, data, i, *p);
2207 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002208 goto end;
2209 }
Victor Stinner1205f272010-09-11 00:54:47 +00002210 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002211 else {
2212 assert(i < PyUnicode_GET_LENGTH(string));
2213 PyUnicode_WRITE(kind, data, i++, *f);
2214 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002215 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002216 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002217
Benjamin Peterson29060642009-01-31 22:14:21 +00002218 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002219 if (callresults)
2220 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002221 if (numberresults)
2222 PyObject_Free(numberresults);
2223 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00002224 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002225 if (callresults) {
2226 PyObject **callresult2 = callresults;
2227 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002228 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002229 ++callresult2;
2230 }
2231 PyObject_Free(callresults);
2232 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002233 if (numberresults)
2234 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002235 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002236}
2237
Walter Dörwaldd2034312007-05-18 16:29:38 +00002238PyObject *
2239PyUnicode_FromFormat(const char *format, ...)
2240{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002241 PyObject* ret;
2242 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002243
2244#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002245 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002246#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002247 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002248#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002249 ret = PyUnicode_FromFormatV(format, vargs);
2250 va_end(vargs);
2251 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002252}
2253
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002254#ifdef HAVE_WCHAR_H
2255
Victor Stinner5593d8a2010-10-02 11:11:27 +00002256/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2257 convert a Unicode object to a wide character string.
2258
Victor Stinnerd88d9832011-09-06 02:00:05 +02002259 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002260 character) required to convert the unicode object. Ignore size argument.
2261
Victor Stinnerd88d9832011-09-06 02:00:05 +02002262 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002263 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002264 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002265static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00002266unicode_aswidechar(PyUnicodeObject *unicode,
2267 wchar_t *w,
2268 Py_ssize_t size)
2269{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002270 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002271 const wchar_t *wstr;
2272
2273 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
2274 if (wstr == NULL)
2275 return -1;
2276
Victor Stinner5593d8a2010-10-02 11:11:27 +00002277 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002278 if (size > res)
2279 size = res + 1;
2280 else
2281 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002282 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002283 return res;
2284 }
2285 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002286 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002287}
2288
2289Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002290PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002291 wchar_t *w,
2292 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002293{
2294 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002295 PyErr_BadInternalCall();
2296 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002297 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002298 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002299}
2300
Victor Stinner137c34c2010-09-29 10:25:54 +00002301wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002302PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002303 Py_ssize_t *size)
2304{
2305 wchar_t* buffer;
2306 Py_ssize_t buflen;
2307
2308 if (unicode == NULL) {
2309 PyErr_BadInternalCall();
2310 return NULL;
2311 }
2312
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002313 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002314 if (buflen == -1)
2315 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002316 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002317 PyErr_NoMemory();
2318 return NULL;
2319 }
2320
Victor Stinner137c34c2010-09-29 10:25:54 +00002321 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2322 if (buffer == NULL) {
2323 PyErr_NoMemory();
2324 return NULL;
2325 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002326 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002327 if (buflen == -1)
2328 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002329 if (size != NULL)
2330 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002331 return buffer;
2332}
2333
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002334#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002335
Alexander Belopolsky40018472011-02-26 01:02:56 +00002336PyObject *
2337PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002338{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002339 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002340 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002341 PyErr_SetString(PyExc_ValueError,
2342 "chr() arg not in range(0x110000)");
2343 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002344 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002345
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002346 if (ordinal < 256)
2347 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002348
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002349 v = PyUnicode_New(1, ordinal);
2350 if (v == NULL)
2351 return NULL;
2352 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
2353 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002354}
2355
Alexander Belopolsky40018472011-02-26 01:02:56 +00002356PyObject *
2357PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002358{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002359 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002360 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002361 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002362 if (PyUnicode_READY(obj))
2363 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002364 Py_INCREF(obj);
2365 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002366 }
2367 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002368 /* For a Unicode subtype that's not a Unicode object,
2369 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002370 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002371 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002372 PyErr_Format(PyExc_TypeError,
2373 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002374 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002375 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002376}
2377
Alexander Belopolsky40018472011-02-26 01:02:56 +00002378PyObject *
2379PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002380 const char *encoding,
2381 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002382{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002383 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002384 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002385
Guido van Rossumd57fd912000-03-10 22:53:23 +00002386 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002387 PyErr_BadInternalCall();
2388 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002389 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002390
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002391 /* Decoding bytes objects is the most common case and should be fast */
2392 if (PyBytes_Check(obj)) {
2393 if (PyBytes_GET_SIZE(obj) == 0) {
2394 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002395 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002396 }
2397 else {
2398 v = PyUnicode_Decode(
2399 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2400 encoding, errors);
2401 }
2402 return v;
2403 }
2404
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002405 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002406 PyErr_SetString(PyExc_TypeError,
2407 "decoding str is not supported");
2408 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002409 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002410
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002411 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2412 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2413 PyErr_Format(PyExc_TypeError,
2414 "coercing to str: need bytes, bytearray "
2415 "or buffer-like object, %.80s found",
2416 Py_TYPE(obj)->tp_name);
2417 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002418 }
Tim Petersced69f82003-09-16 20:30:58 +00002419
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002420 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002421 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002422 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002423 }
Tim Petersced69f82003-09-16 20:30:58 +00002424 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002425 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002426
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002427 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002428 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002429}
2430
/* Copy `encoding` into `lower`, lower-casing upper-case characters and
   turning every '_' into '-', so that spellings such as UTF_8 are caught.
   `lower_len` is the capacity of `lower` including the terminating null.
   Return 1 on success, 0 when the encoding name is longer than
   lower_len-1 characters. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src = encoding;
    char *dst = lower;
    /* Last usable slot: reserved for the terminating null. */
    char *const limit = lower + (lower_len - 1);

    for (; *src != '\0'; src++) {
        if (dst == limit)
            return 0;

        if (Py_ISUPPER(*src))
            *dst = Py_TOLOWER(*src);
        else if (*src == '_')
            *dst = '-';
        else
            *dst = *src;
        dst++;
    }
    *dst = '\0';
    return 1;
}
2463
Alexander Belopolsky40018472011-02-26 01:02:56 +00002464PyObject *
2465PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002466 Py_ssize_t size,
2467 const char *encoding,
2468 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002469{
2470 PyObject *buffer = NULL, *unicode;
2471 Py_buffer info;
2472 char lower[11]; /* Enough for any encoding shortcut */
2473
2474 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002475 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00002476
2477 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002478 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002479 if ((strcmp(lower, "utf-8") == 0) ||
2480 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002481 return PyUnicode_DecodeUTF8(s, size, errors);
2482 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002483 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002484 (strcmp(lower, "iso-8859-1") == 0))
2485 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002486#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002487 else if (strcmp(lower, "mbcs") == 0)
2488 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002489#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002490 else if (strcmp(lower, "ascii") == 0)
2491 return PyUnicode_DecodeASCII(s, size, errors);
2492 else if (strcmp(lower, "utf-16") == 0)
2493 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2494 else if (strcmp(lower, "utf-32") == 0)
2495 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2496 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002497
2498 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002499 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002500 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002501 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002502 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002503 if (buffer == NULL)
2504 goto onError;
2505 unicode = PyCodec_Decode(buffer, encoding, errors);
2506 if (unicode == NULL)
2507 goto onError;
2508 if (!PyUnicode_Check(unicode)) {
2509 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002510 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002511 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002512 Py_DECREF(unicode);
2513 goto onError;
2514 }
2515 Py_DECREF(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002516 if (PyUnicode_READY(unicode)) {
2517 Py_DECREF(unicode);
2518 return NULL;
2519 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002520 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002521
Benjamin Peterson29060642009-01-31 22:14:21 +00002522 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002523 Py_XDECREF(buffer);
2524 return NULL;
2525}
2526
Alexander Belopolsky40018472011-02-26 01:02:56 +00002527PyObject *
2528PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002529 const char *encoding,
2530 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002531{
2532 PyObject *v;
2533
2534 if (!PyUnicode_Check(unicode)) {
2535 PyErr_BadArgument();
2536 goto onError;
2537 }
2538
2539 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002540 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002541
2542 /* Decode via the codec registry */
2543 v = PyCodec_Decode(unicode, encoding, errors);
2544 if (v == NULL)
2545 goto onError;
2546 return v;
2547
Benjamin Peterson29060642009-01-31 22:14:21 +00002548 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002549 return NULL;
2550}
2551
Alexander Belopolsky40018472011-02-26 01:02:56 +00002552PyObject *
2553PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002554 const char *encoding,
2555 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002556{
2557 PyObject *v;
2558
2559 if (!PyUnicode_Check(unicode)) {
2560 PyErr_BadArgument();
2561 goto onError;
2562 }
2563
2564 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002565 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002566
2567 /* Decode via the codec registry */
2568 v = PyCodec_Decode(unicode, encoding, errors);
2569 if (v == NULL)
2570 goto onError;
2571 if (!PyUnicode_Check(v)) {
2572 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002573 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002574 Py_TYPE(v)->tp_name);
2575 Py_DECREF(v);
2576 goto onError;
2577 }
2578 return v;
2579
Benjamin Peterson29060642009-01-31 22:14:21 +00002580 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002581 return NULL;
2582}
2583
Alexander Belopolsky40018472011-02-26 01:02:56 +00002584PyObject *
2585PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002586 Py_ssize_t size,
2587 const char *encoding,
2588 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002589{
2590 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002591
Guido van Rossumd57fd912000-03-10 22:53:23 +00002592 unicode = PyUnicode_FromUnicode(s, size);
2593 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002594 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002595 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2596 Py_DECREF(unicode);
2597 return v;
2598}
2599
Alexander Belopolsky40018472011-02-26 01:02:56 +00002600PyObject *
2601PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002602 const char *encoding,
2603 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002604{
2605 PyObject *v;
2606
2607 if (!PyUnicode_Check(unicode)) {
2608 PyErr_BadArgument();
2609 goto onError;
2610 }
2611
2612 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002613 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002614
2615 /* Encode via the codec registry */
2616 v = PyCodec_Encode(unicode, encoding, errors);
2617 if (v == NULL)
2618 goto onError;
2619 return v;
2620
Benjamin Peterson29060642009-01-31 22:14:21 +00002621 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002622 return NULL;
2623}
2624
Victor Stinnerad158722010-10-27 00:25:46 +00002625PyObject *
2626PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00002627{
Victor Stinner99b95382011-07-04 14:23:54 +02002628#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002629 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2630 PyUnicode_GET_SIZE(unicode),
2631 NULL);
2632#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002633 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00002634#else
Victor Stinner793b5312011-04-27 00:24:21 +02002635 PyInterpreterState *interp = PyThreadState_GET()->interp;
2636 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2637 cannot use it to encode and decode filenames before it is loaded. Load
2638 the Python codec requires to encode at least its own filename. Use the C
2639 version of the locale codec until the codec registry is initialized and
2640 the Python codec is loaded.
2641
2642 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2643 cannot only rely on it: check also interp->fscodec_initialized for
2644 subinterpreters. */
2645 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00002646 return PyUnicode_AsEncodedString(unicode,
2647 Py_FileSystemDefaultEncoding,
2648 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00002649 }
2650 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002651 /* locale encoding with surrogateescape */
2652 wchar_t *wchar;
2653 char *bytes;
2654 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00002655 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002656
2657 wchar = PyUnicode_AsWideCharString(unicode, NULL);
2658 if (wchar == NULL)
2659 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002660 bytes = _Py_wchar2char(wchar, &error_pos);
2661 if (bytes == NULL) {
2662 if (error_pos != (size_t)-1) {
2663 char *errmsg = strerror(errno);
2664 PyObject *exc = NULL;
2665 if (errmsg == NULL)
2666 errmsg = "Py_wchar2char() failed";
2667 raise_encode_exception(&exc,
2668 "filesystemencoding",
2669 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
2670 error_pos, error_pos+1,
2671 errmsg);
2672 Py_XDECREF(exc);
2673 }
2674 else
2675 PyErr_NoMemory();
2676 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002677 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002678 }
2679 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002680
2681 bytes_obj = PyBytes_FromString(bytes);
2682 PyMem_Free(bytes);
2683 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00002684 }
Victor Stinnerad158722010-10-27 00:25:46 +00002685#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00002686}
2687
Alexander Belopolsky40018472011-02-26 01:02:56 +00002688PyObject *
2689PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002690 const char *encoding,
2691 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002692{
2693 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00002694 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00002695
Guido van Rossumd57fd912000-03-10 22:53:23 +00002696 if (!PyUnicode_Check(unicode)) {
2697 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002698 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002699 }
Fred Drakee4315f52000-05-09 19:53:39 +00002700
Victor Stinner2f283c22011-03-02 01:21:46 +00002701 if (encoding == NULL) {
2702 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002703 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002704 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002705 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner2f283c22011-03-02 01:21:46 +00002706 }
Fred Drakee4315f52000-05-09 19:53:39 +00002707
2708 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002709 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002710 if ((strcmp(lower, "utf-8") == 0) ||
2711 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00002712 {
Victor Stinner2f283c22011-03-02 01:21:46 +00002713 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002714 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002715 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002716 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00002717 }
Victor Stinner37296e82010-06-10 13:36:23 +00002718 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002719 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002720 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002721 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002722#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002723 else if (strcmp(lower, "mbcs") == 0)
2724 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2725 PyUnicode_GET_SIZE(unicode),
2726 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002727#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002728 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002729 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00002730 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002731
2732 /* Encode via the codec registry */
2733 v = PyCodec_Encode(unicode, encoding, errors);
2734 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002735 return NULL;
2736
2737 /* The normal path */
2738 if (PyBytes_Check(v))
2739 return v;
2740
2741 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002742 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002743 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002744 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002745
2746 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
2747 "encoder %s returned bytearray instead of bytes",
2748 encoding);
2749 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002750 Py_DECREF(v);
2751 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002752 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002753
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002754 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
2755 Py_DECREF(v);
2756 return b;
2757 }
2758
2759 PyErr_Format(PyExc_TypeError,
2760 "encoder did not return a bytes object (type=%.400s)",
2761 Py_TYPE(v)->tp_name);
2762 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002763 return NULL;
2764}
2765
Alexander Belopolsky40018472011-02-26 01:02:56 +00002766PyObject *
2767PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002768 const char *encoding,
2769 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002770{
2771 PyObject *v;
2772
2773 if (!PyUnicode_Check(unicode)) {
2774 PyErr_BadArgument();
2775 goto onError;
2776 }
2777
2778 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002779 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002780
2781 /* Encode via the codec registry */
2782 v = PyCodec_Encode(unicode, encoding, errors);
2783 if (v == NULL)
2784 goto onError;
2785 if (!PyUnicode_Check(v)) {
2786 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002787 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002788 Py_TYPE(v)->tp_name);
2789 Py_DECREF(v);
2790 goto onError;
2791 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002792 return v;
Tim Petersced69f82003-09-16 20:30:58 +00002793
Benjamin Peterson29060642009-01-31 22:14:21 +00002794 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002795 return NULL;
2796}
2797
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002798PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00002799PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002800 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00002801 return PyUnicode_DecodeFSDefaultAndSize(s, size);
2802}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002803
Christian Heimes5894ba72007-11-04 11:43:14 +00002804PyObject*
2805PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
2806{
Victor Stinner99b95382011-07-04 14:23:54 +02002807#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002808 return PyUnicode_DecodeMBCS(s, size, NULL);
2809#elif defined(__APPLE__)
2810 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
2811#else
Victor Stinner793b5312011-04-27 00:24:21 +02002812 PyInterpreterState *interp = PyThreadState_GET()->interp;
2813 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2814 cannot use it to encode and decode filenames before it is loaded. Load
2815 the Python codec requires to encode at least its own filename. Use the C
2816 version of the locale codec until the codec registry is initialized and
2817 the Python codec is loaded.
2818
2819 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2820 cannot only rely on it: check also interp->fscodec_initialized for
2821 subinterpreters. */
2822 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002823 return PyUnicode_Decode(s, size,
2824 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00002825 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002826 }
2827 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002828 /* locale encoding with surrogateescape */
2829 wchar_t *wchar;
2830 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00002831 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002832
2833 if (s[size] != '\0' || size != strlen(s)) {
2834 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2835 return NULL;
2836 }
2837
Victor Stinner168e1172010-10-16 23:16:16 +00002838 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002839 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00002840 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002841
Victor Stinner168e1172010-10-16 23:16:16 +00002842 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002843 PyMem_Free(wchar);
2844 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002845 }
Victor Stinnerad158722010-10-27 00:25:46 +00002846#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002847}
2848
Martin v. Löwis011e8422009-05-05 04:43:17 +00002849
2850int
2851PyUnicode_FSConverter(PyObject* arg, void* addr)
2852{
2853 PyObject *output = NULL;
2854 Py_ssize_t size;
2855 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002856 if (arg == NULL) {
2857 Py_DECREF(*(PyObject**)addr);
2858 return 1;
2859 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00002860 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00002861 output = arg;
2862 Py_INCREF(output);
2863 }
2864 else {
2865 arg = PyUnicode_FromObject(arg);
2866 if (!arg)
2867 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00002868 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002869 Py_DECREF(arg);
2870 if (!output)
2871 return 0;
2872 if (!PyBytes_Check(output)) {
2873 Py_DECREF(output);
2874 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
2875 return 0;
2876 }
2877 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00002878 size = PyBytes_GET_SIZE(output);
2879 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002880 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05002881 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00002882 Py_DECREF(output);
2883 return 0;
2884 }
2885 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002886 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00002887}
2888
2889
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002890int
2891PyUnicode_FSDecoder(PyObject* arg, void* addr)
2892{
2893 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002894 if (arg == NULL) {
2895 Py_DECREF(*(PyObject**)addr);
2896 return 1;
2897 }
2898 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002899 if (PyUnicode_READY(arg))
2900 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002901 output = arg;
2902 Py_INCREF(output);
2903 }
2904 else {
2905 arg = PyBytes_FromObject(arg);
2906 if (!arg)
2907 return 0;
2908 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
2909 PyBytes_GET_SIZE(arg));
2910 Py_DECREF(arg);
2911 if (!output)
2912 return 0;
2913 if (!PyUnicode_Check(output)) {
2914 Py_DECREF(output);
2915 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
2916 return 0;
2917 }
2918 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002919 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
2920 PyUnicode_GET_LENGTH(output), 0, 1)) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002921 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2922 Py_DECREF(output);
2923 return 0;
2924 }
2925 *(PyObject**)addr = output;
2926 return Py_CLEANUP_SUPPORTED;
2927}
2928
2929
Martin v. Löwis5b222132007-06-10 09:51:05 +00002930char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002931PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002932{
Christian Heimesf3863112007-11-22 07:46:41 +00002933 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002934 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
2935
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00002936 if (!PyUnicode_Check(unicode)) {
2937 PyErr_BadArgument();
2938 return NULL;
2939 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002940 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002941 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002942
Victor Stinnere90fe6a2011-10-01 16:48:13 +02002943 if (PyUnicode_UTF8(unicode) == NULL) {
2944 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002945 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
2946 if (bytes == NULL)
2947 return NULL;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02002948 _PyUnicode_UTF8(u) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
2949 if (_PyUnicode_UTF8(u) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002950 Py_DECREF(bytes);
2951 return NULL;
2952 }
Victor Stinnere90fe6a2011-10-01 16:48:13 +02002953 _PyUnicode_UTF8_LENGTH(u) = PyBytes_GET_SIZE(bytes);
2954 Py_MEMCPY(_PyUnicode_UTF8(u), PyBytes_AS_STRING(bytes), _PyUnicode_UTF8_LENGTH(u) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002955 Py_DECREF(bytes);
2956 }
2957
2958 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02002959 *psize = PyUnicode_UTF8_LENGTH(unicode);
2960 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00002961}
2962
2963char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002964PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00002965{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002966 return PyUnicode_AsUTF8AndSize(unicode, NULL);
2967}
2968
2969#ifdef Py_DEBUG
2970int unicode_as_unicode_calls = 0;
2971#endif
2972
2973
2974Py_UNICODE *
2975PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
2976{
2977 PyUnicodeObject *u;
2978 const unsigned char *one_byte;
2979#if SIZEOF_WCHAR_T == 4
2980 const Py_UCS2 *two_bytes;
2981#else
2982 const Py_UCS4 *four_bytes;
2983 const Py_UCS4 *ucs4_end;
2984 Py_ssize_t num_surrogates;
2985#endif
2986 wchar_t *w;
2987 wchar_t *wchar_end;
2988
2989 if (!PyUnicode_Check(unicode)) {
2990 PyErr_BadArgument();
2991 return NULL;
2992 }
2993 u = (PyUnicodeObject*)unicode;
2994 if (_PyUnicode_WSTR(u) == NULL) {
2995 /* Non-ASCII compact unicode object */
2996 assert(_PyUnicode_KIND(u) != 0);
2997 assert(PyUnicode_IS_READY(u));
2998
2999#ifdef Py_DEBUG
3000 ++unicode_as_unicode_calls;
3001#endif
3002
3003 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
3004#if SIZEOF_WCHAR_T == 2
3005 four_bytes = PyUnicode_4BYTE_DATA(u);
3006 ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
3007 num_surrogates = 0;
3008
3009 for (; four_bytes < ucs4_end; ++four_bytes) {
3010 if (*four_bytes > 0xFFFF)
3011 ++num_surrogates;
3012 }
3013
3014 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
3015 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
3016 if (!_PyUnicode_WSTR(u)) {
3017 PyErr_NoMemory();
3018 return NULL;
3019 }
3020 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;
3021
3022 w = _PyUnicode_WSTR(u);
3023 wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
3024 four_bytes = PyUnicode_4BYTE_DATA(u);
3025 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3026 if (*four_bytes > 0xFFFF) {
3027 /* encode surrogate pair in this case */
3028 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
3029 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
3030 }
3031 else
3032 *w = *four_bytes;
3033
3034 if (w > wchar_end) {
3035 assert(0 && "Miscalculated string end");
3036 }
3037 }
3038 *w = 0;
3039#else
3040 /* sizeof(wchar_t) == 4 */
3041 Py_FatalError("Impossible unicode object state, wstr and str "
3042 "should share memory already.");
3043 return NULL;
3044#endif
3045 }
3046 else {
3047 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3048 (_PyUnicode_LENGTH(u) + 1));
3049 if (!_PyUnicode_WSTR(u)) {
3050 PyErr_NoMemory();
3051 return NULL;
3052 }
3053 if (!PyUnicode_IS_COMPACT_ASCII(u))
3054 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
3055 w = _PyUnicode_WSTR(u);
3056 wchar_end = w + _PyUnicode_LENGTH(u);
3057
3058 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
3059 one_byte = PyUnicode_1BYTE_DATA(u);
3060 for (; w < wchar_end; ++one_byte, ++w)
3061 *w = *one_byte;
3062 /* null-terminate the wstr */
3063 *w = 0;
3064 }
3065 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
3066#if SIZEOF_WCHAR_T == 4
3067 two_bytes = PyUnicode_2BYTE_DATA(u);
3068 for (; w < wchar_end; ++two_bytes, ++w)
3069 *w = *two_bytes;
3070 /* null-terminate the wstr */
3071 *w = 0;
3072#else
3073 /* sizeof(wchar_t) == 2 */
3074 PyObject_FREE(_PyUnicode_WSTR(u));
3075 _PyUnicode_WSTR(u) = NULL;
3076 Py_FatalError("Impossible unicode object state, wstr "
3077 "and str should share memory already.");
3078 return NULL;
3079#endif
3080 }
3081 else {
3082 assert(0 && "This should never happen.");
3083 }
3084 }
3085 }
3086 if (size != NULL)
3087 *size = PyUnicode_WSTR_LENGTH(u);
3088 return _PyUnicode_WSTR(u);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003089}
3090
Alexander Belopolsky40018472011-02-26 01:02:56 +00003091Py_UNICODE *
3092PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003093{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003094 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003095}
3096
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003097
Alexander Belopolsky40018472011-02-26 01:02:56 +00003098Py_ssize_t
3099PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003100{
3101 if (!PyUnicode_Check(unicode)) {
3102 PyErr_BadArgument();
3103 goto onError;
3104 }
3105 return PyUnicode_GET_SIZE(unicode);
3106
Benjamin Peterson29060642009-01-31 22:14:21 +00003107 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003108 return -1;
3109}
3110
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003111Py_ssize_t
3112PyUnicode_GetLength(PyObject *unicode)
3113{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003114 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003115 PyErr_BadArgument();
3116 return -1;
3117 }
3118
3119 return PyUnicode_GET_LENGTH(unicode);
3120}
3121
3122Py_UCS4
3123PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3124{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003125 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3126 PyErr_BadArgument();
3127 return (Py_UCS4)-1;
3128 }
3129 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3130 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003131 return (Py_UCS4)-1;
3132 }
3133 return PyUnicode_READ_CHAR(unicode, index);
3134}
3135
3136int
3137PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3138{
3139 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003140 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003141 return -1;
3142 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02003143 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3144 PyErr_SetString(PyExc_IndexError, "string index out of range");
3145 return -1;
3146 }
3147 if (_PyUnicode_Dirty(unicode))
3148 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003149 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3150 index, ch);
3151 return 0;
3152}
3153
/* Return the name of the default encoding; this is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3159
Victor Stinner554f3f02010-06-16 23:33:54 +00003160/* create or adjust a UnicodeDecodeError */
3161static void
3162make_decode_exception(PyObject **exceptionObject,
3163 const char *encoding,
3164 const char *input, Py_ssize_t length,
3165 Py_ssize_t startpos, Py_ssize_t endpos,
3166 const char *reason)
3167{
3168 if (*exceptionObject == NULL) {
3169 *exceptionObject = PyUnicodeDecodeError_Create(
3170 encoding, input, length, startpos, endpos, reason);
3171 }
3172 else {
3173 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3174 goto onError;
3175 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3176 goto onError;
3177 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3178 goto onError;
3179 }
3180 return;
3181
3182onError:
3183 Py_DECREF(*exceptionObject);
3184 *exceptionObject = NULL;
3185}
3186
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003187/* error handling callback helper:
3188 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003189 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003190 and adjust various state variables.
3191 return 0 on success, -1 on error
3192*/
3193
Alexander Belopolsky40018472011-02-26 01:02:56 +00003194static int
3195unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003196 const char *encoding, const char *reason,
3197 const char **input, const char **inend, Py_ssize_t *startinpos,
3198 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
3199 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003200{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003201 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003202
3203 PyObject *restuple = NULL;
3204 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003205 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003206 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003207 Py_ssize_t requiredsize;
3208 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003209 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003210 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003211 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003212 int res = -1;
3213
3214 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003215 *errorHandler = PyCodec_LookupError(errors);
3216 if (*errorHandler == NULL)
3217 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003218 }
3219
Victor Stinner554f3f02010-06-16 23:33:54 +00003220 make_decode_exception(exceptionObject,
3221 encoding,
3222 *input, *inend - *input,
3223 *startinpos, *endinpos,
3224 reason);
3225 if (*exceptionObject == NULL)
3226 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003227
3228 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3229 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003230 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003231 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003232 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003233 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003234 }
3235 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003236 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003237
3238 /* Copy back the bytes variables, which might have been modified by the
3239 callback */
3240 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3241 if (!inputobj)
3242 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003243 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003244 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003245 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003246 *input = PyBytes_AS_STRING(inputobj);
3247 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003248 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003249 /* we can DECREF safely, as the exception has another reference,
3250 so the object won't go away. */
3251 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003252
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003253 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003254 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003255 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003256 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3257 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003258 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003259
3260 /* need more space? (at least enough for what we
3261 have+the replacement+the rest of the string (starting
3262 at the new input position), so we won't have to check space
3263 when there are no errors in the rest of the string) */
3264 repptr = PyUnicode_AS_UNICODE(repunicode);
3265 repsize = PyUnicode_GET_SIZE(repunicode);
3266 requiredsize = *outpos + repsize + insize-newpos;
3267 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003268 if (requiredsize<2*outsize)
3269 requiredsize = 2*outsize;
Victor Stinnerfe226c02011-10-03 03:52:20 +02003270 if (PyUnicode_Resize((PyObject**)output, requiredsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003271 goto onError;
3272 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003273 }
3274 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003275 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003276 Py_UNICODE_COPY(*outptr, repptr, repsize);
3277 *outptr += repsize;
3278 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003279
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003280 /* we made it! */
3281 res = 0;
3282
Benjamin Peterson29060642009-01-31 22:14:21 +00003283 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003284 Py_XDECREF(restuple);
3285 return res;
3286}
3287
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003288/* --- UTF-7 Codec -------------------------------------------------------- */
3289
Antoine Pitrou244651a2009-05-04 18:56:13 +00003290/* See RFC2152 for details. We encode conservatively and decode liberally. */
3291
/* Helper macros for the base-64 alphabet used by UTF-7.  Arguments are
   evaluated more than once, so pass side-effect-free expressions only. */

/* True if c belongs to the base-64 alphabet (A-Z, a-z, 0-9, '+', '/'). */

#define IS_BASE64(c) \
    ((c) == '+' || (c) == '/' ||            \
     ('0' <= (c) && (c) <= '9') ||          \
     ('A' <= (c) && (c) <= 'Z') ||          \
     ('a' <= (c) && (c) <= 'z'))

/* Base-64 value (0..63) of c; only meaningful when IS_BASE64(c) holds.
   Anything not otherwise matched yields 63, the value of '/'. */

#define FROM_BASE64(c) \
    ((c) == '+' ? 62 :                                  \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :      \
     ('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :           \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 : 63)

/* Base-64 character representing the low 6 bits of n. */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: true if byte c, met outside a shift sequence in UTF-7
 * input, stands for itself.  Decoding is permissive: every ASCII byte
 * does, except '+', which opens a base64 sequence. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) <= 127)
3322
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read (through ENCODE_DIRECT), hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c is evaluated several times; directO and directWS are parenthesized
 * so that arbitrary expressions (e.g. `a || b`) can be passed safely. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003368
Alexander Belopolsky40018472011-02-26 01:02:56 +00003369PyObject *
3370PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003371 Py_ssize_t size,
3372 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003373{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003374 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3375}
3376
Antoine Pitrou244651a2009-05-04 18:56:13 +00003377/* The decoder. The only state we preserve is our read position,
3378 * i.e. how many characters we have consumed. So if we end in the
3379 * middle of a shift sequence we have to back off the read position
3380 * and the output to the beginning of the sequence, otherwise we lose
3381 * all the shift state (seen bits, number of bits seen, high
3382 * surrogate). */
3383
Alexander Belopolsky40018472011-02-26 01:02:56 +00003384PyObject *
3385PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003386 Py_ssize_t size,
3387 const char *errors,
3388 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003389{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003390 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003391 Py_ssize_t startinpos;
3392 Py_ssize_t endinpos;
3393 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003394 const char *e;
3395 PyUnicodeObject *unicode;
3396 Py_UNICODE *p;
3397 const char *errmsg = "";
3398 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003399 Py_UNICODE *shiftOutStart;
3400 unsigned int base64bits = 0;
3401 unsigned long base64buffer = 0;
3402 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003403 PyObject *errorHandler = NULL;
3404 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003405
3406 unicode = _PyUnicode_New(size);
3407 if (!unicode)
3408 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003409 if (size == 0) {
3410 if (consumed)
3411 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003412 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003413 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003414
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003415 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003416 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003417 e = s + size;
3418
3419 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003420 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003421 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003422 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003423
Antoine Pitrou244651a2009-05-04 18:56:13 +00003424 if (inShift) { /* in a base-64 section */
3425 if (IS_BASE64(ch)) { /* consume a base-64 character */
3426 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3427 base64bits += 6;
3428 s++;
3429 if (base64bits >= 16) {
3430 /* we have enough bits for a UTF-16 value */
3431 Py_UNICODE outCh = (Py_UNICODE)
3432 (base64buffer >> (base64bits-16));
3433 base64bits -= 16;
3434 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3435 if (surrogate) {
3436 /* expecting a second surrogate */
3437 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3438#ifdef Py_UNICODE_WIDE
3439 *p++ = (((surrogate & 0x3FF)<<10)
3440 | (outCh & 0x3FF)) + 0x10000;
3441#else
3442 *p++ = surrogate;
3443 *p++ = outCh;
3444#endif
3445 surrogate = 0;
3446 }
3447 else {
3448 surrogate = 0;
3449 errmsg = "second surrogate missing";
3450 goto utf7Error;
3451 }
3452 }
3453 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3454 /* first surrogate */
3455 surrogate = outCh;
3456 }
3457 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3458 errmsg = "unexpected second surrogate";
3459 goto utf7Error;
3460 }
3461 else {
3462 *p++ = outCh;
3463 }
3464 }
3465 }
3466 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003467 inShift = 0;
3468 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003469 if (surrogate) {
3470 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003471 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003472 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003473 if (base64bits > 0) { /* left-over bits */
3474 if (base64bits >= 6) {
3475 /* We've seen at least one base-64 character */
3476 errmsg = "partial character in shift sequence";
3477 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003478 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003479 else {
3480 /* Some bits remain; they should be zero */
3481 if (base64buffer != 0) {
3482 errmsg = "non-zero padding bits in shift sequence";
3483 goto utf7Error;
3484 }
3485 }
3486 }
3487 if (ch != '-') {
3488 /* '-' is absorbed; other terminating
3489 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003490 *p++ = ch;
3491 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003492 }
3493 }
3494 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003495 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003496 s++; /* consume '+' */
3497 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003498 s++;
3499 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003500 }
3501 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003502 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003503 shiftOutStart = p;
3504 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003505 }
3506 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003507 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003508 *p++ = ch;
3509 s++;
3510 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003511 else {
3512 startinpos = s-starts;
3513 s++;
3514 errmsg = "unexpected special character";
3515 goto utf7Error;
3516 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003517 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003518utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003519 outpos = p-PyUnicode_AS_UNICODE(unicode);
3520 endinpos = s-starts;
3521 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003522 errors, &errorHandler,
3523 "utf7", errmsg,
3524 &starts, &e, &startinpos, &endinpos, &exc, &s,
3525 &unicode, &outpos, &p))
3526 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003527 }
3528
Antoine Pitrou244651a2009-05-04 18:56:13 +00003529 /* end of string */
3530
3531 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3532 /* if we're in an inconsistent state, that's an error */
3533 if (surrogate ||
3534 (base64bits >= 6) ||
3535 (base64bits > 0 && base64buffer != 0)) {
3536 outpos = p-PyUnicode_AS_UNICODE(unicode);
3537 endinpos = size;
3538 if (unicode_decode_call_errorhandler(
3539 errors, &errorHandler,
3540 "utf7", "unterminated shift sequence",
3541 &starts, &e, &startinpos, &endinpos, &exc, &s,
3542 &unicode, &outpos, &p))
3543 goto onError;
3544 if (s < e)
3545 goto restart;
3546 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003547 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003548
3549 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003550 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003551 if (inShift) {
3552 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003553 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003554 }
3555 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003556 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003557 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003558 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003559
Victor Stinnerfe226c02011-10-03 03:52:20 +02003560 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003561 goto onError;
3562
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003563 Py_XDECREF(errorHandler);
3564 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003565 if (PyUnicode_READY(unicode) == -1) {
3566 Py_DECREF(unicode);
3567 return NULL;
3568 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003569 return (PyObject *)unicode;
3570
Benjamin Peterson29060642009-01-31 22:14:21 +00003571 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003572 Py_XDECREF(errorHandler);
3573 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003574 Py_DECREF(unicode);
3575 return NULL;
3576}
3577
3578
Alexander Belopolsky40018472011-02-26 01:02:56 +00003579PyObject *
3580PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003581 Py_ssize_t size,
3582 int base64SetO,
3583 int base64WhiteSpace,
3584 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003585{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003586 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003587 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003588 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003589 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003590 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003591 unsigned int base64bits = 0;
3592 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003593 char * out;
3594 char * start;
3595
3596 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003597 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003598
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003599 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003600 return PyErr_NoMemory();
3601
Antoine Pitrou244651a2009-05-04 18:56:13 +00003602 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003603 if (v == NULL)
3604 return NULL;
3605
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003606 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003607 for (;i < size; ++i) {
3608 Py_UNICODE ch = s[i];
3609
Antoine Pitrou244651a2009-05-04 18:56:13 +00003610 if (inShift) {
3611 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3612 /* shifting out */
3613 if (base64bits) { /* output remaining bits */
3614 *out++ = TO_BASE64(base64buffer << (6-base64bits));
3615 base64buffer = 0;
3616 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003617 }
3618 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003619 /* Characters not in the BASE64 set implicitly unshift the sequence
3620 so no '-' is required, except if the character is itself a '-' */
3621 if (IS_BASE64(ch) || ch == '-') {
3622 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003623 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003624 *out++ = (char) ch;
3625 }
3626 else {
3627 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00003628 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003629 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003630 else { /* not in a shift sequence */
3631 if (ch == '+') {
3632 *out++ = '+';
3633 *out++ = '-';
3634 }
3635 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3636 *out++ = (char) ch;
3637 }
3638 else {
3639 *out++ = '+';
3640 inShift = 1;
3641 goto encode_char;
3642 }
3643 }
3644 continue;
3645encode_char:
3646#ifdef Py_UNICODE_WIDE
3647 if (ch >= 0x10000) {
3648 /* code first surrogate */
3649 base64bits += 16;
3650 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
3651 while (base64bits >= 6) {
3652 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3653 base64bits -= 6;
3654 }
3655 /* prepare second surrogate */
3656 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
3657 }
3658#endif
3659 base64bits += 16;
3660 base64buffer = (base64buffer << 16) | ch;
3661 while (base64bits >= 6) {
3662 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3663 base64bits -= 6;
3664 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00003665 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003666 if (base64bits)
3667 *out++= TO_BASE64(base64buffer << (6-base64bits) );
3668 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003669 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003670 if (_PyBytes_Resize(&v, out - start) < 0)
3671 return NULL;
3672 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003673}
3674
Antoine Pitrou244651a2009-05-04 18:56:13 +00003675#undef IS_BASE64
3676#undef FROM_BASE64
3677#undef TO_BASE64
3678#undef DECODE_DIRECT
3679#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003680
Guido van Rossumd57fd912000-03-10 22:53:23 +00003681/* --- UTF-8 Codec -------------------------------------------------------- */
3682
/* Sequence length implied by the first byte of a UTF-8 sequence, indexed
   by that byte.  A zero entry marks a byte that is not a legal first
   byte.  See RFC 3629 for details. */
static char utf8_code_length[256] = {
    /* 00-7F: single byte */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    /* 80-BF: not a legal first byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    /* C0-FF: first bytes of multi-byte sequences, or illegal */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
3704
Alexander Belopolsky40018472011-02-26 01:02:56 +00003705PyObject *
3706PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003707 Py_ssize_t size,
3708 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003709{
Walter Dörwald69652032004-09-07 20:24:22 +00003710 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3711}
3712
Antoine Pitrouab868312009-01-10 15:40:25 +00003713/* Mask to check or force alignment of a pointer to C 'long' boundaries */
3714#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
3715
3716/* Mask to quickly check whether a C 'long' contains a
3717 non-ASCII, UTF8-encoded char. */
3718#if (SIZEOF_LONG == 8)
3719# define ASCII_CHAR_MASK 0x8080808080808080L
3720#elif (SIZEOF_LONG == 4)
3721# define ASCII_CHAR_MASK 0x80808080L
3722#else
3723# error C 'long' size should be either 4 or 8!
3724#endif
3725
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003726/* Scans a UTF-8 string and returns the maximum character to be expected,
3727 the size of the decoded unicode string and if any major errors were
3728 encountered.
3729
3730 This function does check basic UTF-8 sanity, it does however NOT CHECK
3731 if the string contains surrogates, and if all continuation bytes are
3732 within the correct ranges, these checks are performed in
3733 PyUnicode_DecodeUTF8Stateful.
3734
3735 If it sets has_errors to 1, it means the value of unicode_size and max_char
3736 will be bogus and you should not rely on useful information in them.
3737 */
3738static Py_UCS4
3739utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
3740 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
3741 int *has_errors)
3742{
3743 Py_ssize_t n;
3744 Py_ssize_t char_count = 0;
3745 Py_UCS4 max_char = 127, new_max;
3746 Py_UCS4 upper_bound;
3747 const unsigned char *p = (const unsigned char *)s;
3748 const unsigned char *end = p + string_size;
3749 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
3750 int err = 0;
3751
3752 for (; p < end && !err; ++p, ++char_count) {
3753 /* Only check value if it's not a ASCII char... */
3754 if (*p < 0x80) {
3755 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
3756 an explanation. */
3757 if (!((size_t) p & LONG_PTR_MASK)) {
3758 /* Help register allocation */
3759 register const unsigned char *_p = p;
3760 while (_p < aligned_end) {
3761 unsigned long value = *(unsigned long *) _p;
3762 if (value & ASCII_CHAR_MASK)
3763 break;
3764 _p += SIZEOF_LONG;
3765 char_count += SIZEOF_LONG;
3766 }
3767 p = _p;
3768 if (p == end)
3769 break;
3770 }
3771 }
3772 if (*p >= 0x80) {
3773 n = utf8_code_length[*p];
3774 new_max = max_char;
3775 switch (n) {
3776 /* invalid start byte */
3777 case 0:
3778 err = 1;
3779 break;
3780 case 2:
3781 /* Code points between 0x00FF and 0x07FF inclusive.
3782 Approximate the upper bound of the code point,
3783 if this flips over 255 we can be sure it will be more
3784 than 255 and the string will need 2 bytes per code coint,
3785 if it stays under or equal to 255, we can be sure 1 byte
3786 is enough.
3787 ((*p & 0b00011111) << 6) | 0b00111111 */
3788 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
3789 if (max_char < upper_bound)
3790 new_max = upper_bound;
3791 /* Ensure we track at least that we left ASCII space. */
3792 if (new_max < 128)
3793 new_max = 128;
3794 break;
3795 case 3:
3796 /* Between 0x0FFF and 0xFFFF inclusive, so values are
3797 always > 255 and <= 65535 and will always need 2 bytes. */
3798 if (max_char < 65535)
3799 new_max = 65535;
3800 break;
3801 case 4:
3802 /* Code point will be above 0xFFFF for sure in this case. */
3803 new_max = 65537;
3804 break;
3805 /* Internal error, this should be caught by the first if */
3806 case 1:
3807 default:
3808 assert(0 && "Impossible case in utf8_max_char_and_size");
3809 err = 1;
3810 }
3811 /* Instead of number of overall bytes for this code point,
3812 n containts the number of following bytes: */
3813 --n;
3814 /* Check if the follow up chars are all valid continuation bytes */
3815 if (n >= 1) {
3816 const unsigned char *cont;
3817 if ((p + n) >= end) {
3818 if (consumed == 0)
3819 /* incomplete data, non-incremental decoding */
3820 err = 1;
3821 break;
3822 }
3823 for (cont = p + 1; cont < (p + n); ++cont) {
3824 if ((*cont & 0xc0) != 0x80) {
3825 err = 1;
3826 break;
3827 }
3828 }
3829 p += n;
3830 }
3831 else
3832 err = 1;
3833 max_char = new_max;
3834 }
3835 }
3836
3837 if (unicode_size)
3838 *unicode_size = char_count;
3839 if (has_errors)
3840 *has_errors = err;
3841 return max_char;
3842}
3843
/* Store `value` at position `index` of the character buffer `buf`.

   Works like PyUnicode_WRITE but additionally accepts
   PyUnicode_WCHAR_KIND, in which case `buf` is treated as the Py_UNICODE
   (wstr) array of the legacy representation.  Any kind other than WCHAR,
   1BYTE or 2BYTE is written as Py_UCS4.  Exactly one branch runs, so
   `index` and `value` are each evaluated once. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value) \
    do { \
        const int k_ = (kind); \
        switch (k_) { \
        case PyUnicode_WCHAR_KIND: \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
            break; \
        case PyUnicode_1BYTE_KIND: \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
            break; \
        case PyUnicode_2BYTE_KIND: \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value); \
            break; \
        default: \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value); \
            break; \
        } \
    } while (0)
3858
Alexander Belopolsky40018472011-02-26 01:02:56 +00003859PyObject *
3860PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003861 Py_ssize_t size,
3862 const char *errors,
3863 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003864{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003865 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003866 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00003867 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003868 Py_ssize_t startinpos;
3869 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00003870 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003871 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003872 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003873 PyObject *errorHandler = NULL;
3874 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003875 Py_UCS4 maxchar = 0;
3876 Py_ssize_t unicode_size;
3877 Py_ssize_t i;
3878 int kind;
3879 void *data;
3880 int has_errors;
3881 Py_UNICODE *error_outptr;
3882#if SIZEOF_WCHAR_T == 2
3883 Py_ssize_t wchar_offset = 0;
3884#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003885
Walter Dörwald69652032004-09-07 20:24:22 +00003886 if (size == 0) {
3887 if (consumed)
3888 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003889 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00003890 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003891 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
3892 consumed, &has_errors);
3893 if (has_errors) {
3894 unicode = _PyUnicode_New(size);
3895 if (!unicode)
3896 return NULL;
3897 kind = PyUnicode_WCHAR_KIND;
3898 data = PyUnicode_AS_UNICODE(unicode);
3899 assert(data != NULL);
3900 }
3901 else {
3902 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
3903 if (!unicode)
3904 return NULL;
3905 /* When the string is ASCII only, just use memcpy and return.
3906 unicode_size may be != size if there is an incomplete UTF-8
3907 sequence at the end of the ASCII block. */
3908 if (maxchar < 128 && size == unicode_size) {
3909 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
3910 return (PyObject *)unicode;
3911 }
3912 kind = PyUnicode_KIND(unicode);
3913 data = PyUnicode_DATA(unicode);
3914 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003915 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003916 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003917 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00003918 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003919
3920 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003921 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003922
3923 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00003924 /* Fast path for runs of ASCII characters. Given that common UTF-8
3925 input will consist of an overwhelming majority of ASCII
3926 characters, we try to optimize for this case by checking
3927 as many characters as a C 'long' can contain.
3928 First, check if we can do an aligned read, as most CPUs have
3929 a penalty for unaligned reads.
3930 */
3931 if (!((size_t) s & LONG_PTR_MASK)) {
3932 /* Help register allocation */
3933 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003934 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003935 while (_s < aligned_end) {
3936 /* Read a whole long at a time (either 4 or 8 bytes),
3937 and do a fast unrolled copy if it only contains ASCII
3938 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003939 unsigned long value = *(unsigned long *) _s;
3940 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00003941 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003942 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
3943 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
3944 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
3945 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00003946#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003947 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
3948 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
3949 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
3950 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00003951#endif
3952 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003953 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00003954 }
3955 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003956 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003957 if (s == e)
3958 break;
3959 ch = (unsigned char)*s;
3960 }
3961 }
3962
3963 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003964 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003965 s++;
3966 continue;
3967 }
3968
3969 n = utf8_code_length[ch];
3970
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003971 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003972 if (consumed)
3973 break;
3974 else {
3975 errmsg = "unexpected end of data";
3976 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003977 endinpos = startinpos+1;
3978 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
3979 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00003980 goto utf8Error;
3981 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003982 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003983
3984 switch (n) {
3985
3986 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00003987 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003988 startinpos = s-starts;
3989 endinpos = startinpos+1;
3990 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003991
3992 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003993 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00003994 startinpos = s-starts;
3995 endinpos = startinpos+1;
3996 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003997
3998 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003999 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004000 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004001 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004002 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004003 goto utf8Error;
4004 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004005 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004006 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004007 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004008 break;
4009
4010 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004011 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4012 will result in surrogates in range d800-dfff. Surrogates are
4013 not valid UTF-8 so they are rejected.
4014 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4015 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004016 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004017 (s[2] & 0xc0) != 0x80 ||
4018 ((unsigned char)s[0] == 0xE0 &&
4019 (unsigned char)s[1] < 0xA0) ||
4020 ((unsigned char)s[0] == 0xED &&
4021 (unsigned char)s[1] > 0x9F)) {
4022 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004023 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004024 endinpos = startinpos + 1;
4025
4026 /* if s[1] first two bits are 1 and 0, then the invalid
4027 continuation byte is s[2], so increment endinpos by 1,
4028 if not, s[1] is invalid and endinpos doesn't need to
4029 be incremented. */
4030 if ((s[1] & 0xC0) == 0x80)
4031 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004032 goto utf8Error;
4033 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004034 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004035 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004036 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004037 break;
4038
4039 case 4:
4040 if ((s[1] & 0xc0) != 0x80 ||
4041 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004042 (s[3] & 0xc0) != 0x80 ||
4043 ((unsigned char)s[0] == 0xF0 &&
4044 (unsigned char)s[1] < 0x90) ||
4045 ((unsigned char)s[0] == 0xF4 &&
4046 (unsigned char)s[1] > 0x8F)) {
4047 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004048 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004049 endinpos = startinpos + 1;
4050 if ((s[1] & 0xC0) == 0x80) {
4051 endinpos++;
4052 if ((s[2] & 0xC0) == 0x80)
4053 endinpos++;
4054 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004055 goto utf8Error;
4056 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004057 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004058 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4059 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4060
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004061 /* If the string is flexible or we have native UCS-4, write
4062 directly.. */
4063 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
4064 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00004065
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004066 else {
4067 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00004068
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004069 /* translate from 10000..10FFFF to 0..FFFF */
4070 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00004071
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004072 /* high surrogate = top 10 bits added to D800 */
4073 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4074 (Py_UNICODE)(0xD800 + (ch >> 10)));
4075
4076 /* low surrogate = bottom 10 bits added to DC00 */
4077 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4078 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
4079 }
4080#if SIZEOF_WCHAR_T == 2
4081 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004082#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004083 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004084 }
4085 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004086 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004087
Benjamin Peterson29060642009-01-31 22:14:21 +00004088 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004089 /* If this is not yet a resizable string, make it one.. */
4090 if (kind != PyUnicode_WCHAR_KIND) {
4091 const Py_UNICODE *u;
4092 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
4093 if (!new_unicode)
4094 goto onError;
4095 u = PyUnicode_AsUnicode((PyObject *)unicode);
4096 if (!u)
4097 goto onError;
4098#if SIZEOF_WCHAR_T == 2
4099 i += wchar_offset;
4100#endif
4101 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
4102 Py_DECREF(unicode);
4103 unicode = new_unicode;
4104 kind = 0;
4105 data = PyUnicode_AS_UNICODE(new_unicode);
4106 assert(data != NULL);
4107 }
4108 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00004109 if (unicode_decode_call_errorhandler(
4110 errors, &errorHandler,
4111 "utf8", errmsg,
4112 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004113 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00004114 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004115 /* Update data because unicode_decode_call_errorhandler might have
4116 re-created or resized the unicode object. */
4117 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00004118 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004119 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004120 /* Ensure the unicode_size calculation above was correct: */
4121 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
4122
Walter Dörwald69652032004-09-07 20:24:22 +00004123 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004124 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004125
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004126 /* Adjust length and ready string when it contained errors and
4127 is of the old resizable kind. */
4128 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02004129 if (PyUnicode_Resize((PyObject**)&unicode, i) < 0 ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004130 PyUnicode_READY(unicode) == -1)
4131 goto onError;
4132 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004133
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004134 Py_XDECREF(errorHandler);
4135 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004136 if (PyUnicode_READY(unicode) == -1) {
4137 Py_DECREF(unicode);
4138 return NULL;
4139 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004140 return (PyObject *)unicode;
4141
Benjamin Peterson29060642009-01-31 22:14:21 +00004142 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004143 Py_XDECREF(errorHandler);
4144 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004145 Py_DECREF(unicode);
4146 return NULL;
4147}
4148
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004149#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00004150
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004151#ifdef __APPLE__
4152
4153/* Simplified UTF-8 decoder using surrogateescape error handler,
4154 used to decode the command line arguments on Mac OS X. */
4155
4156wchar_t*
4157_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4158{
4159 int n;
4160 const char *e;
4161 wchar_t *unicode, *p;
4162
4163 /* Note: size will always be longer than the resulting Unicode
4164 character count */
4165 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
4166 PyErr_NoMemory();
4167 return NULL;
4168 }
4169 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4170 if (!unicode)
4171 return NULL;
4172
4173 /* Unpack UTF-8 encoded data */
4174 p = unicode;
4175 e = s + size;
4176 while (s < e) {
4177 Py_UCS4 ch = (unsigned char)*s;
4178
4179 if (ch < 0x80) {
4180 *p++ = (wchar_t)ch;
4181 s++;
4182 continue;
4183 }
4184
4185 n = utf8_code_length[ch];
4186 if (s + n > e) {
4187 goto surrogateescape;
4188 }
4189
4190 switch (n) {
4191 case 0:
4192 case 1:
4193 goto surrogateescape;
4194
4195 case 2:
4196 if ((s[1] & 0xc0) != 0x80)
4197 goto surrogateescape;
4198 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
4199 assert ((ch > 0x007F) && (ch <= 0x07FF));
4200 *p++ = (wchar_t)ch;
4201 break;
4202
4203 case 3:
4204 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4205 will result in surrogates in range d800-dfff. Surrogates are
4206 not valid UTF-8 so they are rejected.
4207 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4208 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
4209 if ((s[1] & 0xc0) != 0x80 ||
4210 (s[2] & 0xc0) != 0x80 ||
4211 ((unsigned char)s[0] == 0xE0 &&
4212 (unsigned char)s[1] < 0xA0) ||
4213 ((unsigned char)s[0] == 0xED &&
4214 (unsigned char)s[1] > 0x9F)) {
4215
4216 goto surrogateescape;
4217 }
4218 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4219 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004220 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004221 break;
4222
4223 case 4:
4224 if ((s[1] & 0xc0) != 0x80 ||
4225 (s[2] & 0xc0) != 0x80 ||
4226 (s[3] & 0xc0) != 0x80 ||
4227 ((unsigned char)s[0] == 0xF0 &&
4228 (unsigned char)s[1] < 0x90) ||
4229 ((unsigned char)s[0] == 0xF4 &&
4230 (unsigned char)s[1] > 0x8F)) {
4231 goto surrogateescape;
4232 }
4233 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
4234 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4235 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4236
4237#if SIZEOF_WCHAR_T == 4
4238 *p++ = (wchar_t)ch;
4239#else
4240 /* compute and append the two surrogates: */
4241
4242 /* translate from 10000..10FFFF to 0..FFFF */
4243 ch -= 0x10000;
4244
4245 /* high surrogate = top 10 bits added to D800 */
4246 *p++ = (wchar_t)(0xD800 + (ch >> 10));
4247
4248 /* low surrogate = bottom 10 bits added to DC00 */
4249 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
4250#endif
4251 break;
4252 }
4253 s += n;
4254 continue;
4255
4256 surrogateescape:
4257 *p++ = 0xDC00 + ch;
4258 s++;
4259 }
4260 *p = L'\0';
4261 return unicode;
4262}
4263
4264#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004265
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004266/* Primary internal function which creates utf8 encoded bytes objects.
4267
4268 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004269 and allocate exactly as much space needed at the end. Else allocate the
4270 maximum possible needed (4 result bytes per Unicode character), and return
4271 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004272*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004273PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004274_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004275{
Tim Peters602f7402002-04-27 18:03:26 +00004276#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004277
Guido van Rossum98297ee2007-11-06 21:34:58 +00004278 Py_ssize_t i; /* index into s of next input byte */
4279 PyObject *result; /* result string object */
4280 char *p; /* next free byte in output buffer */
4281 Py_ssize_t nallocated; /* number of result bytes allocated */
4282 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004283 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004284 PyObject *errorHandler = NULL;
4285 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004286 int kind;
4287 void *data;
4288 Py_ssize_t size;
4289 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
4290#if SIZEOF_WCHAR_T == 2
4291 Py_ssize_t wchar_offset = 0;
4292#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004293
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004294 if (!PyUnicode_Check(unicode)) {
4295 PyErr_BadArgument();
4296 return NULL;
4297 }
4298
4299 if (PyUnicode_READY(unicode) == -1)
4300 return NULL;
4301
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004302 if (PyUnicode_UTF8(unicode))
4303 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4304 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004305
4306 kind = PyUnicode_KIND(unicode);
4307 data = PyUnicode_DATA(unicode);
4308 size = PyUnicode_GET_LENGTH(unicode);
4309
Tim Peters602f7402002-04-27 18:03:26 +00004310 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004311
Tim Peters602f7402002-04-27 18:03:26 +00004312 if (size <= MAX_SHORT_UNICHARS) {
4313 /* Write into the stack buffer; nallocated can't overflow.
4314 * At the end, we'll allocate exactly as much heap space as it
4315 * turns out we need.
4316 */
4317 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004318 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004319 p = stackbuf;
4320 }
4321 else {
4322 /* Overallocate on the heap, and give the excess back at the end. */
4323 nallocated = size * 4;
4324 if (nallocated / 4 != size) /* overflow! */
4325 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004326 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004327 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004328 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004329 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004330 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004331
Tim Peters602f7402002-04-27 18:03:26 +00004332 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004333 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004334
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004335 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004336 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004337 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004338
Guido van Rossumd57fd912000-03-10 22:53:23 +00004339 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004340 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004341 *p++ = (char)(0xc0 | (ch >> 6));
4342 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004343 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004344 Py_ssize_t newpos;
4345 PyObject *rep;
4346 Py_ssize_t repsize, k, startpos;
4347 startpos = i-1;
4348#if SIZEOF_WCHAR_T == 2
4349 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00004350#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004351 rep = unicode_encode_call_errorhandler(
4352 errors, &errorHandler, "utf-8", "surrogates not allowed",
4353 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4354 &exc, startpos, startpos+1, &newpos);
4355 if (!rep)
4356 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004357
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004358 if (PyBytes_Check(rep))
4359 repsize = PyBytes_GET_SIZE(rep);
4360 else
4361 repsize = PyUnicode_GET_SIZE(rep);
4362
4363 if (repsize > 4) {
4364 Py_ssize_t offset;
4365
4366 if (result == NULL)
4367 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004368 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004369 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004370
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004371 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4372 /* integer overflow */
4373 PyErr_NoMemory();
4374 goto error;
4375 }
4376 nallocated += repsize - 4;
4377 if (result != NULL) {
4378 if (_PyBytes_Resize(&result, nallocated) < 0)
4379 goto error;
4380 } else {
4381 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004382 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004383 goto error;
4384 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4385 }
4386 p = PyBytes_AS_STRING(result) + offset;
4387 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004388
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004389 if (PyBytes_Check(rep)) {
4390 char *prep = PyBytes_AS_STRING(rep);
4391 for(k = repsize; k > 0; k--)
4392 *p++ = *prep++;
4393 } else /* rep is unicode */ {
4394 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4395 Py_UNICODE c;
4396
4397 for(k=0; k<repsize; k++) {
4398 c = prep[k];
4399 if (0x80 <= c) {
4400 raise_encode_exception(&exc, "utf-8",
4401 PyUnicode_AS_UNICODE(unicode),
4402 size, i-1, i,
4403 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004404 goto error;
4405 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004406 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004407 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004408 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004409 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004410 } else if (ch < 0x10000) {
4411 *p++ = (char)(0xe0 | (ch >> 12));
4412 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4413 *p++ = (char)(0x80 | (ch & 0x3f));
4414 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004415 /* Encode UCS4 Unicode ordinals */
4416 *p++ = (char)(0xf0 | (ch >> 18));
4417 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4418 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4419 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004420#if SIZEOF_WCHAR_T == 2
4421 wchar_offset++;
4422#endif
Tim Peters602f7402002-04-27 18:03:26 +00004423 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004424 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004425
Guido van Rossum98297ee2007-11-06 21:34:58 +00004426 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004427 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004428 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004429 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004430 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004431 }
4432 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004433 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004434 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004435 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004436 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004437 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004438
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004439 Py_XDECREF(errorHandler);
4440 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004441 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004442 error:
4443 Py_XDECREF(errorHandler);
4444 Py_XDECREF(exc);
4445 Py_XDECREF(result);
4446 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004447
Tim Peters602f7402002-04-27 18:03:26 +00004448#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004449}
4450
Alexander Belopolsky40018472011-02-26 01:02:56 +00004451PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004452PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4453 Py_ssize_t size,
4454 const char *errors)
4455{
4456 PyObject *v, *unicode;
4457
4458 unicode = PyUnicode_FromUnicode(s, size);
4459 if (unicode == NULL)
4460 return NULL;
4461 v = _PyUnicode_AsUTF8String(unicode, errors);
4462 Py_DECREF(unicode);
4463 return v;
4464}
4465
4466PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004467PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004468{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004469 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004470}
4471
Walter Dörwald41980ca2007-08-16 21:55:45 +00004472/* --- UTF-32 Codec ------------------------------------------------------- */
4473
4474PyObject *
4475PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004476 Py_ssize_t size,
4477 const char *errors,
4478 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004479{
4480 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4481}
4482
4483PyObject *
4484PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004485 Py_ssize_t size,
4486 const char *errors,
4487 int *byteorder,
4488 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004489{
4490 const char *starts = s;
4491 Py_ssize_t startinpos;
4492 Py_ssize_t endinpos;
4493 Py_ssize_t outpos;
4494 PyUnicodeObject *unicode;
4495 Py_UNICODE *p;
4496#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004497 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004498 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004499#else
4500 const int pairs = 0;
4501#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004502 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004503 int bo = 0; /* assume native ordering by default */
4504 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004505 /* Offsets from q for retrieving bytes in the right order. */
4506#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4507 int iorder[] = {0, 1, 2, 3};
4508#else
4509 int iorder[] = {3, 2, 1, 0};
4510#endif
4511 PyObject *errorHandler = NULL;
4512 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004513
Walter Dörwald41980ca2007-08-16 21:55:45 +00004514 q = (unsigned char *)s;
4515 e = q + size;
4516
4517 if (byteorder)
4518 bo = *byteorder;
4519
4520 /* Check for BOM marks (U+FEFF) in the input and adjust current
4521 byte order setting accordingly. In native mode, the leading BOM
4522 mark is skipped, in all other modes, it is copied to the output
4523 stream as-is (giving a ZWNBSP character). */
4524 if (bo == 0) {
4525 if (size >= 4) {
4526 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004527 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004528#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004529 if (bom == 0x0000FEFF) {
4530 q += 4;
4531 bo = -1;
4532 }
4533 else if (bom == 0xFFFE0000) {
4534 q += 4;
4535 bo = 1;
4536 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004537#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004538 if (bom == 0x0000FEFF) {
4539 q += 4;
4540 bo = 1;
4541 }
4542 else if (bom == 0xFFFE0000) {
4543 q += 4;
4544 bo = -1;
4545 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004546#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004547 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004548 }
4549
4550 if (bo == -1) {
4551 /* force LE */
4552 iorder[0] = 0;
4553 iorder[1] = 1;
4554 iorder[2] = 2;
4555 iorder[3] = 3;
4556 }
4557 else if (bo == 1) {
4558 /* force BE */
4559 iorder[0] = 3;
4560 iorder[1] = 2;
4561 iorder[2] = 1;
4562 iorder[3] = 0;
4563 }
4564
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004565 /* On narrow builds we split characters outside the BMP into two
4566 codepoints => count how much extra space we need. */
4567#ifndef Py_UNICODE_WIDE
4568 for (qq = q; qq < e; qq += 4)
4569 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4570 pairs++;
4571#endif
4572
4573 /* This might be one to much, because of a BOM */
4574 unicode = _PyUnicode_New((size+3)/4+pairs);
4575 if (!unicode)
4576 return NULL;
4577 if (size == 0)
4578 return (PyObject *)unicode;
4579
4580 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004581 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004582
Walter Dörwald41980ca2007-08-16 21:55:45 +00004583 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004584 Py_UCS4 ch;
4585 /* remaining bytes at the end? (size should be divisible by 4) */
4586 if (e-q<4) {
4587 if (consumed)
4588 break;
4589 errmsg = "truncated data";
4590 startinpos = ((const char *)q)-starts;
4591 endinpos = ((const char *)e)-starts;
4592 goto utf32Error;
4593 /* The remaining input chars are ignored if the callback
4594 chooses to skip the input */
4595 }
4596 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4597 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004598
Benjamin Peterson29060642009-01-31 22:14:21 +00004599 if (ch >= 0x110000)
4600 {
4601 errmsg = "codepoint not in range(0x110000)";
4602 startinpos = ((const char *)q)-starts;
4603 endinpos = startinpos+4;
4604 goto utf32Error;
4605 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004606#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004607 if (ch >= 0x10000)
4608 {
4609 *p++ = 0xD800 | ((ch-0x10000) >> 10);
4610 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4611 }
4612 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00004613#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004614 *p++ = ch;
4615 q += 4;
4616 continue;
4617 utf32Error:
4618 outpos = p-PyUnicode_AS_UNICODE(unicode);
4619 if (unicode_decode_call_errorhandler(
4620 errors, &errorHandler,
4621 "utf32", errmsg,
4622 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4623 &unicode, &outpos, &p))
4624 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004625 }
4626
4627 if (byteorder)
4628 *byteorder = bo;
4629
4630 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004631 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004632
4633 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02004634 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004635 goto onError;
4636
4637 Py_XDECREF(errorHandler);
4638 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004639 if (PyUnicode_READY(unicode) == -1) {
4640 Py_DECREF(unicode);
4641 return NULL;
4642 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004643 return (PyObject *)unicode;
4644
Benjamin Peterson29060642009-01-31 22:14:21 +00004645 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00004646 Py_DECREF(unicode);
4647 Py_XDECREF(errorHandler);
4648 Py_XDECREF(exc);
4649 return NULL;
4650}
4651
4652PyObject *
4653PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004654 Py_ssize_t size,
4655 const char *errors,
4656 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004657{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004658 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004659 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004660 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004661#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004662 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004663#else
4664 const int pairs = 0;
4665#endif
4666 /* Offsets from p for storing byte pairs in the right order. */
4667#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4668 int iorder[] = {0, 1, 2, 3};
4669#else
4670 int iorder[] = {3, 2, 1, 0};
4671#endif
4672
Benjamin Peterson29060642009-01-31 22:14:21 +00004673#define STORECHAR(CH) \
4674 do { \
4675 p[iorder[3]] = ((CH) >> 24) & 0xff; \
4676 p[iorder[2]] = ((CH) >> 16) & 0xff; \
4677 p[iorder[1]] = ((CH) >> 8) & 0xff; \
4678 p[iorder[0]] = (CH) & 0xff; \
4679 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00004680 } while(0)
4681
4682 /* In narrow builds we can output surrogate pairs as one codepoint,
4683 so we need less space. */
4684#ifndef Py_UNICODE_WIDE
4685 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004686 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
4687 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
4688 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004689#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004690 nsize = (size - pairs + (byteorder == 0));
4691 bytesize = nsize * 4;
4692 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004693 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004694 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004695 if (v == NULL)
4696 return NULL;
4697
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004698 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004699 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004700 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004701 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004702 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004703
4704 if (byteorder == -1) {
4705 /* force LE */
4706 iorder[0] = 0;
4707 iorder[1] = 1;
4708 iorder[2] = 2;
4709 iorder[3] = 3;
4710 }
4711 else if (byteorder == 1) {
4712 /* force BE */
4713 iorder[0] = 3;
4714 iorder[1] = 2;
4715 iorder[2] = 1;
4716 iorder[3] = 0;
4717 }
4718
4719 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004720 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004721#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004722 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
4723 Py_UCS4 ch2 = *s;
4724 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
4725 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
4726 s++;
4727 size--;
4728 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004729 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004730#endif
4731 STORECHAR(ch);
4732 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004733
4734 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004735 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004736#undef STORECHAR
4737}
4738
Alexander Belopolsky40018472011-02-26 01:02:56 +00004739PyObject *
4740PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004741{
4742 if (!PyUnicode_Check(unicode)) {
4743 PyErr_BadArgument();
4744 return NULL;
4745 }
4746 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004747 PyUnicode_GET_SIZE(unicode),
4748 NULL,
4749 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004750}
4751
Guido van Rossumd57fd912000-03-10 22:53:23 +00004752/* --- UTF-16 Codec ------------------------------------------------------- */
4753
Tim Peters772747b2001-08-09 22:21:55 +00004754PyObject *
4755PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004756 Py_ssize_t size,
4757 const char *errors,
4758 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004759{
Walter Dörwald69652032004-09-07 20:24:22 +00004760 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
4761}
4762
Antoine Pitrouab868312009-01-10 15:40:25 +00004763/* Two masks for fast checking of whether a C 'long' may contain
4764 UTF16-encoded surrogate characters. This is an efficient heuristic,
4765 assuming that non-surrogate characters with a code point >= 0x8000 are
4766 rare in most input.
4767 FAST_CHAR_MASK is used when the input is in native byte ordering,
4768 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00004769*/
Antoine Pitrouab868312009-01-10 15:40:25 +00004770#if (SIZEOF_LONG == 8)
4771# define FAST_CHAR_MASK 0x8000800080008000L
4772# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
4773#elif (SIZEOF_LONG == 4)
4774# define FAST_CHAR_MASK 0x80008000L
4775# define SWAPPED_FAST_CHAR_MASK 0x00800080L
4776#else
4777# error C 'long' size should be either 4 or 8!
4778#endif
4779
Walter Dörwald69652032004-09-07 20:24:22 +00004780PyObject *
4781PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004782 Py_ssize_t size,
4783 const char *errors,
4784 int *byteorder,
4785 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004786{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004787 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004788 Py_ssize_t startinpos;
4789 Py_ssize_t endinpos;
4790 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004791 PyUnicodeObject *unicode;
4792 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004793 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00004794 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00004795 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004796 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00004797 /* Offsets from q for retrieving byte pairs in the right order. */
4798#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4799 int ihi = 1, ilo = 0;
4800#else
4801 int ihi = 0, ilo = 1;
4802#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004803 PyObject *errorHandler = NULL;
4804 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004805
4806 /* Note: size will always be longer than the resulting Unicode
4807 character count */
4808 unicode = _PyUnicode_New(size);
4809 if (!unicode)
4810 return NULL;
4811 if (size == 0)
4812 return (PyObject *)unicode;
4813
4814 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004815 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00004816 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00004817 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004818
4819 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00004820 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004821
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004822 /* Check for BOM marks (U+FEFF) in the input and adjust current
4823 byte order setting accordingly. In native mode, the leading BOM
4824 mark is skipped, in all other modes, it is copied to the output
4825 stream as-is (giving a ZWNBSP character). */
4826 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00004827 if (size >= 2) {
4828 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004829#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004830 if (bom == 0xFEFF) {
4831 q += 2;
4832 bo = -1;
4833 }
4834 else if (bom == 0xFFFE) {
4835 q += 2;
4836 bo = 1;
4837 }
Tim Petersced69f82003-09-16 20:30:58 +00004838#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004839 if (bom == 0xFEFF) {
4840 q += 2;
4841 bo = 1;
4842 }
4843 else if (bom == 0xFFFE) {
4844 q += 2;
4845 bo = -1;
4846 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004847#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004848 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004849 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004850
Tim Peters772747b2001-08-09 22:21:55 +00004851 if (bo == -1) {
4852 /* force LE */
4853 ihi = 1;
4854 ilo = 0;
4855 }
4856 else if (bo == 1) {
4857 /* force BE */
4858 ihi = 0;
4859 ilo = 1;
4860 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004861#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4862 native_ordering = ilo < ihi;
4863#else
4864 native_ordering = ilo > ihi;
4865#endif
Tim Peters772747b2001-08-09 22:21:55 +00004866
Antoine Pitrouab868312009-01-10 15:40:25 +00004867 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00004868 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004869 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00004870 /* First check for possible aligned read of a C 'long'. Unaligned
4871 reads are more expensive, better to defer to another iteration. */
4872 if (!((size_t) q & LONG_PTR_MASK)) {
4873 /* Fast path for runs of non-surrogate chars. */
4874 register const unsigned char *_q = q;
4875 Py_UNICODE *_p = p;
4876 if (native_ordering) {
4877 /* Native ordering is simple: as long as the input cannot
4878 possibly contain a surrogate char, do an unrolled copy
4879 of several 16-bit code points to the target object.
4880 The non-surrogate check is done on several input bytes
4881 at a time (as many as a C 'long' can contain). */
4882 while (_q < aligned_end) {
4883 unsigned long data = * (unsigned long *) _q;
4884 if (data & FAST_CHAR_MASK)
4885 break;
4886 _p[0] = ((unsigned short *) _q)[0];
4887 _p[1] = ((unsigned short *) _q)[1];
4888#if (SIZEOF_LONG == 8)
4889 _p[2] = ((unsigned short *) _q)[2];
4890 _p[3] = ((unsigned short *) _q)[3];
4891#endif
4892 _q += SIZEOF_LONG;
4893 _p += SIZEOF_LONG / 2;
4894 }
4895 }
4896 else {
4897 /* Byteswapped ordering is similar, but we must decompose
4898 the copy bytewise, and take care of zero'ing out the
4899 upper bytes if the target object is in 32-bit units
4900 (that is, in UCS-4 builds). */
4901 while (_q < aligned_end) {
4902 unsigned long data = * (unsigned long *) _q;
4903 if (data & SWAPPED_FAST_CHAR_MASK)
4904 break;
4905 /* Zero upper bytes in UCS-4 builds */
4906#if (Py_UNICODE_SIZE > 2)
4907 _p[0] = 0;
4908 _p[1] = 0;
4909#if (SIZEOF_LONG == 8)
4910 _p[2] = 0;
4911 _p[3] = 0;
4912#endif
4913#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004914 /* Issue #4916; UCS-4 builds on big endian machines must
4915 fill the two last bytes of each 4-byte unit. */
4916#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
4917# define OFF 2
4918#else
4919# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00004920#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004921 ((unsigned char *) _p)[OFF + 1] = _q[0];
4922 ((unsigned char *) _p)[OFF + 0] = _q[1];
4923 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
4924 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
4925#if (SIZEOF_LONG == 8)
4926 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
4927 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
4928 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
4929 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
4930#endif
4931#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00004932 _q += SIZEOF_LONG;
4933 _p += SIZEOF_LONG / 2;
4934 }
4935 }
4936 p = _p;
4937 q = _q;
4938 if (q >= e)
4939 break;
4940 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004941 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004942
Benjamin Peterson14339b62009-01-31 16:36:08 +00004943 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00004944
4945 if (ch < 0xD800 || ch > 0xDFFF) {
4946 *p++ = ch;
4947 continue;
4948 }
4949
4950 /* UTF-16 code pair: */
4951 if (q > e) {
4952 errmsg = "unexpected end of data";
4953 startinpos = (((const char *)q) - 2) - starts;
4954 endinpos = ((const char *)e) + 1 - starts;
4955 goto utf16Error;
4956 }
4957 if (0xD800 <= ch && ch <= 0xDBFF) {
4958 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
4959 q += 2;
4960 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00004961#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004962 *p++ = ch;
4963 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004964#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004965 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004966#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004967 continue;
4968 }
4969 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004970 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00004971 startinpos = (((const char *)q)-4)-starts;
4972 endinpos = startinpos+2;
4973 goto utf16Error;
4974 }
4975
Benjamin Peterson14339b62009-01-31 16:36:08 +00004976 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004977 errmsg = "illegal encoding";
4978 startinpos = (((const char *)q)-2)-starts;
4979 endinpos = startinpos+2;
4980 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004981
Benjamin Peterson29060642009-01-31 22:14:21 +00004982 utf16Error:
4983 outpos = p - PyUnicode_AS_UNICODE(unicode);
4984 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00004985 errors,
4986 &errorHandler,
4987 "utf16", errmsg,
4988 &starts,
4989 (const char **)&e,
4990 &startinpos,
4991 &endinpos,
4992 &exc,
4993 (const char **)&q,
4994 &unicode,
4995 &outpos,
4996 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00004997 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004998 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004999 /* remaining byte at the end? (size should be even) */
5000 if (e == q) {
5001 if (!consumed) {
5002 errmsg = "truncated data";
5003 startinpos = ((const char *)q) - starts;
5004 endinpos = ((const char *)e) + 1 - starts;
5005 outpos = p - PyUnicode_AS_UNICODE(unicode);
5006 if (unicode_decode_call_errorhandler(
5007 errors,
5008 &errorHandler,
5009 "utf16", errmsg,
5010 &starts,
5011 (const char **)&e,
5012 &startinpos,
5013 &endinpos,
5014 &exc,
5015 (const char **)&q,
5016 &unicode,
5017 &outpos,
5018 &p))
5019 goto onError;
5020 /* The remaining input chars are ignored if the callback
5021 chooses to skip the input */
5022 }
5023 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005024
5025 if (byteorder)
5026 *byteorder = bo;
5027
Walter Dörwald69652032004-09-07 20:24:22 +00005028 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005029 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005030
Guido van Rossumd57fd912000-03-10 22:53:23 +00005031 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02005032 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005033 goto onError;
5034
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005035 Py_XDECREF(errorHandler);
5036 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005037 if (PyUnicode_READY(unicode) == -1) {
5038 Py_DECREF(unicode);
5039 return NULL;
5040 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005041 return (PyObject *)unicode;
5042
Benjamin Peterson29060642009-01-31 22:14:21 +00005043 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005044 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005045 Py_XDECREF(errorHandler);
5046 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005047 return NULL;
5048}
5049
Antoine Pitrouab868312009-01-10 15:40:25 +00005050#undef FAST_CHAR_MASK
5051#undef SWAPPED_FAST_CHAR_MASK
5052
Tim Peters772747b2001-08-09 22:21:55 +00005053PyObject *
5054PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005055 Py_ssize_t size,
5056 const char *errors,
5057 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005058{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005059 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005060 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005061 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005062#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005063 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005064#else
5065 const int pairs = 0;
5066#endif
Tim Peters772747b2001-08-09 22:21:55 +00005067 /* Offsets from p for storing byte pairs in the right order. */
5068#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5069 int ihi = 1, ilo = 0;
5070#else
5071 int ihi = 0, ilo = 1;
5072#endif
5073
Benjamin Peterson29060642009-01-31 22:14:21 +00005074#define STORECHAR(CH) \
5075 do { \
5076 p[ihi] = ((CH) >> 8) & 0xff; \
5077 p[ilo] = (CH) & 0xff; \
5078 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005079 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005080
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005081#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005082 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00005083 if (s[i] >= 0x10000)
5084 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005085#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005086 /* 2 * (size + pairs + (byteorder == 0)) */
5087 if (size > PY_SSIZE_T_MAX ||
5088 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005089 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005090 nsize = size + pairs + (byteorder == 0);
5091 bytesize = nsize * 2;
5092 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005093 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005094 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005095 if (v == NULL)
5096 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005097
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005098 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005099 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005100 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005101 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005102 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005103
5104 if (byteorder == -1) {
5105 /* force LE */
5106 ihi = 1;
5107 ilo = 0;
5108 }
5109 else if (byteorder == 1) {
5110 /* force BE */
5111 ihi = 0;
5112 ilo = 1;
5113 }
5114
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005115 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005116 Py_UNICODE ch = *s++;
5117 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005118#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005119 if (ch >= 0x10000) {
5120 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5121 ch = 0xD800 | ((ch-0x10000) >> 10);
5122 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005123#endif
Tim Peters772747b2001-08-09 22:21:55 +00005124 STORECHAR(ch);
5125 if (ch2)
5126 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005127 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005128
5129 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005130 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005131#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005132}
5133
Alexander Belopolsky40018472011-02-26 01:02:56 +00005134PyObject *
5135PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005136{
5137 if (!PyUnicode_Check(unicode)) {
5138 PyErr_BadArgument();
5139 return NULL;
5140 }
5141 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005142 PyUnicode_GET_SIZE(unicode),
5143 NULL,
5144 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005145}
5146
5147/* --- Unicode Escape Codec ----------------------------------------------- */
5148
/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
   if all the escapes in the string make it still a valid ASCII string.

   s     escaped input bytes.
   size  number of bytes at `s`.

   Returns -1 if `size` is negative, if a non-ASCII byte is present, if a
   backslash is the last byte, or if an escape is found that may produce
   a non-ASCII character (octal, \x, \u, \U, \N).  Otherwise returns the
   length of the buffer required to hold the decoded string.
   */
Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Reject a negative size before doing any pointer arithmetic with
       it: forming "s + size" for size < 0 is not valid. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                /* single-character escapes yield one character */
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* count the backslash + the other character */
                length += 2;
                break;
            }
        }
    }
    return length;
}
5203
5204/* Similar to PyUnicode_WRITE but either write into wstr field
5205 or treat string as ASCII. */
5206#define WRITE_ASCII_OR_WSTR(kind, buf, index, value) \
5207 do { \
5208 if ((kind) != PyUnicode_WCHAR_KIND) \
5209 ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
5210 else \
5211 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
5212 } while (0)
5213
5214#define WRITE_WSTR(buf, index, value) \
5215 assert(kind == PyUnicode_WCHAR_KIND), \
5216 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value)
5217
5218
Fredrik Lundh06d12682001-01-24 07:59:11 +00005219static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005220
Alexander Belopolsky40018472011-02-26 01:02:56 +00005221PyObject *
5222PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005223 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005224 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005225{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005226 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005227 Py_ssize_t startinpos;
5228 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005229 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005230 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005231 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005232 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005233 char* message;
5234 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005235 PyObject *errorHandler = NULL;
5236 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005237 Py_ssize_t ascii_length;
5238 Py_ssize_t i;
5239 int kind;
5240 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005241
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005242 ascii_length = length_of_escaped_ascii_string(s, size);
5243
5244 /* After length_of_escaped_ascii_string() there are two alternatives,
5245 either the string is pure ASCII with named escapes like \n, etc.
5246 and we determined it's exact size (common case)
5247 or it contains \x, \u, ... escape sequences. then we create a
5248 legacy wchar string and resize it at the end of this function. */
5249 if (ascii_length >= 0) {
5250 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
5251 if (!v)
5252 goto onError;
5253 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5254 kind = PyUnicode_1BYTE_KIND;
5255 data = PyUnicode_DATA(v);
5256 }
5257 else {
5258 /* Escaped strings will always be longer than the resulting
5259 Unicode string, so we start with size here and then reduce the
5260 length after conversion to the true value.
5261 (but if the error callback returns a long replacement string
5262 we'll have to allocate more space) */
5263 v = _PyUnicode_New(size);
5264 if (!v)
5265 goto onError;
5266 kind = PyUnicode_WCHAR_KIND;
5267 data = PyUnicode_AS_UNICODE(v);
5268 }
5269
Guido van Rossumd57fd912000-03-10 22:53:23 +00005270 if (size == 0)
5271 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005272 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005273 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005274
Guido van Rossumd57fd912000-03-10 22:53:23 +00005275 while (s < end) {
5276 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005277 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005278 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005279
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005280 if (kind == PyUnicode_WCHAR_KIND) {
5281 assert(i < _PyUnicode_WSTR_LENGTH(v));
5282 }
5283 else {
5284 /* The only case in which i == ascii_length is a backslash
5285 followed by a newline. */
5286 assert(i <= ascii_length);
5287 }
5288
Guido van Rossumd57fd912000-03-10 22:53:23 +00005289 /* Non-escape characters are interpreted as Unicode ordinals */
5290 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005291 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005292 continue;
5293 }
5294
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005295 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005296 /* \ - Escapes */
5297 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005298 c = *s++;
5299 if (s > end)
5300 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005301
5302 if (kind == PyUnicode_WCHAR_KIND) {
5303 assert(i < _PyUnicode_WSTR_LENGTH(v));
5304 }
5305 else {
5306 /* The only case in which i == ascii_length is a backslash
5307 followed by a newline. */
5308 assert(i < ascii_length || (i == ascii_length && c == '\n'));
5309 }
5310
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005311 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005312
Benjamin Peterson29060642009-01-31 22:14:21 +00005313 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005314 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005315 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5316 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5317 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5318 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5319 /* FF */
5320 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5321 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5322 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5323 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5324 /* VT */
5325 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5326 /* BEL, not classic C */
5327 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005328
Benjamin Peterson29060642009-01-31 22:14:21 +00005329 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005330 case '0': case '1': case '2': case '3':
5331 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005332 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005333 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005334 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005335 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005336 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005337 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005338 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005339 break;
5340
Benjamin Peterson29060642009-01-31 22:14:21 +00005341 /* hex escapes */
5342 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005343 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005344 digits = 2;
5345 message = "truncated \\xXX escape";
5346 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005347
Benjamin Peterson29060642009-01-31 22:14:21 +00005348 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005349 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005350 digits = 4;
5351 message = "truncated \\uXXXX escape";
5352 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005353
Benjamin Peterson29060642009-01-31 22:14:21 +00005354 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005355 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005356 digits = 8;
5357 message = "truncated \\UXXXXXXXX escape";
5358 hexescape:
5359 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005360 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005361 if (s+digits>end) {
5362 endinpos = size;
5363 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005364 errors, &errorHandler,
5365 "unicodeescape", "end of string in escape sequence",
5366 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005367 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005368 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005369 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005370 goto nextByte;
5371 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005372 for (j = 0; j < digits; ++j) {
5373 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005374 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005375 endinpos = (s+j+1)-starts;
5376 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005377 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005378 errors, &errorHandler,
5379 "unicodeescape", message,
5380 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005381 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005382 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005383 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005384 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005385 }
5386 chr = (chr<<4) & ~0xF;
5387 if (c >= '0' && c <= '9')
5388 chr += c - '0';
5389 else if (c >= 'a' && c <= 'f')
5390 chr += 10 + c - 'a';
5391 else
5392 chr += 10 + c - 'A';
5393 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005394 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005395 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005396 /* _decoding_error will have already written into the
5397 target buffer. */
5398 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005399 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005400 /* when we get here, chr is a 32-bit unicode character */
5401 if (chr <= 0xffff)
5402 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005403 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005404 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005405 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005406 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005407#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005408 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005409#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005410 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005411 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5412 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005413#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005414 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005415 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005416 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005417 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005418 errors, &errorHandler,
5419 "unicodeescape", "illegal Unicode character",
5420 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005421 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005422 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005423 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005424 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005425 break;
5426
Benjamin Peterson29060642009-01-31 22:14:21 +00005427 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005428 case 'N':
5429 message = "malformed \\N character escape";
5430 if (ucnhash_CAPI == NULL) {
5431 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005432 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5433 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005434 if (ucnhash_CAPI == NULL)
5435 goto ucnhashError;
5436 }
5437 if (*s == '{') {
5438 const char *start = s+1;
5439 /* look for the closing brace */
5440 while (*s != '}' && s < end)
5441 s++;
5442 if (s > start && s < end && *s == '}') {
5443 /* found a name. look it up in the unicode database */
5444 message = "unknown Unicode character name";
5445 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005446 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5447 &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005448 goto store;
5449 }
5450 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005451 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005452 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005453 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005454 errors, &errorHandler,
5455 "unicodeescape", message,
5456 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005457 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005458 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005459 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005460 break;
5461
5462 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005463 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005464 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005465 message = "\\ at end of string";
5466 s--;
5467 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005468 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005469 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005470 errors, &errorHandler,
5471 "unicodeescape", message,
5472 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005473 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005474 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005475 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005476 }
5477 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005478 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5479 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005480 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005481 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005482 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005483 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005484 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005485 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005486 /* Ensure the length prediction worked in case of ASCII strings */
5487 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5488
Victor Stinnerfe226c02011-10-03 03:52:20 +02005489 if (kind == PyUnicode_WCHAR_KIND)
5490 {
5491 if (PyUnicode_Resize((PyObject**)&v, i) < 0)
5492 goto onError;
5493 if (PyUnicode_READY(v) == -1)
5494 goto onError;
5495 }
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005496 Py_XDECREF(errorHandler);
5497 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005498 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005499
Benjamin Peterson29060642009-01-31 22:14:21 +00005500 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005501 PyErr_SetString(
5502 PyExc_UnicodeError,
5503 "\\N escapes not supported (can't load unicodedata module)"
5504 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005505 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005506 Py_XDECREF(errorHandler);
5507 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005508 return NULL;
5509
Benjamin Peterson29060642009-01-31 22:14:21 +00005510 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005511 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005512 Py_XDECREF(errorHandler);
5513 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005514 return NULL;
5515}
5516
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005517#undef WRITE_ASCII_OR_WSTR
5518#undef WRITE_WSTR
5519
/* Return a Unicode-Escape encoded version of a Py_UNICODE buffer.

   The encoder below takes only the buffer and its length; it has no
   "quotes" argument and never wraps the result in u"" or u'' quotes.

*/
5526
/* Lowercase hexadecimal digits, indexed by nibble value (0..15).  The escape
   encoders in this file use it to format the digits of \xhh, \uxxxx and
   \Uxxxxxxxx escapes. */
static const char *hexdigits = "0123456789abcdef";
5528
Alexander Belopolsky40018472011-02-26 01:02:56 +00005529PyObject *
5530PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005531 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005532{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005533 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005534 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005535
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005536#ifdef Py_UNICODE_WIDE
5537 const Py_ssize_t expandsize = 10;
5538#else
5539 const Py_ssize_t expandsize = 6;
5540#endif
5541
Thomas Wouters89f507f2006-12-13 04:49:30 +00005542 /* XXX(nnorwitz): rather than over-allocating, it would be
5543 better to choose a different scheme. Perhaps scan the
5544 first N-chars of the string and allocate based on that size.
5545 */
5546 /* Initial allocation is based on the longest-possible unichr
5547 escape.
5548
5549 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5550 unichr, so in this case it's the longest unichr escape. In
5551 narrow (UTF-16) builds this is five chars per source unichr
5552 since there are two unichrs in the surrogate pair, so in narrow
5553 (UTF-16) builds it's not the longest unichr escape.
5554
5555 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5556 so in the narrow (UTF-16) build case it's the longest unichr
5557 escape.
5558 */
5559
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005560 if (size == 0)
5561 return PyBytes_FromStringAndSize(NULL, 0);
5562
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005563 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005564 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005565
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005566 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005567 2
5568 + expandsize*size
5569 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005570 if (repr == NULL)
5571 return NULL;
5572
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005573 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005574
Guido van Rossumd57fd912000-03-10 22:53:23 +00005575 while (size-- > 0) {
5576 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005577
Walter Dörwald79e913e2007-05-12 11:08:06 +00005578 /* Escape backslashes */
5579 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005580 *p++ = '\\';
5581 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005582 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005583 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005584
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00005585#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005586 /* Map 21-bit characters to '\U00xxxxxx' */
5587 else if (ch >= 0x10000) {
5588 *p++ = '\\';
5589 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005590 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5591 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5592 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5593 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5594 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5595 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5596 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5597 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005598 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005599 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005600#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005601 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5602 else if (ch >= 0xD800 && ch < 0xDC00) {
5603 Py_UNICODE ch2;
5604 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00005605
Benjamin Peterson29060642009-01-31 22:14:21 +00005606 ch2 = *s++;
5607 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005608 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005609 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5610 *p++ = '\\';
5611 *p++ = 'U';
5612 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
5613 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
5614 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
5615 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
5616 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
5617 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
5618 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
5619 *p++ = hexdigits[ucs & 0x0000000F];
5620 continue;
5621 }
5622 /* Fall through: isolated surrogates are copied as-is */
5623 s--;
5624 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005625 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005626#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005627
Guido van Rossumd57fd912000-03-10 22:53:23 +00005628 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005629 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005630 *p++ = '\\';
5631 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005632 *p++ = hexdigits[(ch >> 12) & 0x000F];
5633 *p++ = hexdigits[(ch >> 8) & 0x000F];
5634 *p++ = hexdigits[(ch >> 4) & 0x000F];
5635 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005636 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005637
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005638 /* Map special whitespace to '\t', \n', '\r' */
5639 else if (ch == '\t') {
5640 *p++ = '\\';
5641 *p++ = 't';
5642 }
5643 else if (ch == '\n') {
5644 *p++ = '\\';
5645 *p++ = 'n';
5646 }
5647 else if (ch == '\r') {
5648 *p++ = '\\';
5649 *p++ = 'r';
5650 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005651
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005652 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005653 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005654 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005655 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005656 *p++ = hexdigits[(ch >> 4) & 0x000F];
5657 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005658 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005659
Guido van Rossumd57fd912000-03-10 22:53:23 +00005660 /* Copy everything else as-is */
5661 else
5662 *p++ = (char) ch;
5663 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005664
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005665 assert(p - PyBytes_AS_STRING(repr) > 0);
5666 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5667 return NULL;
5668 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005669}
5670
Alexander Belopolsky40018472011-02-26 01:02:56 +00005671PyObject *
5672PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005673{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005674 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005675 if (!PyUnicode_Check(unicode)) {
5676 PyErr_BadArgument();
5677 return NULL;
5678 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00005679 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5680 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005681 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005682}
5683
5684/* --- Raw Unicode Escape Codec ------------------------------------------- */
5685
Alexander Belopolsky40018472011-02-26 01:02:56 +00005686PyObject *
5687PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005688 Py_ssize_t size,
5689 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005690{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005691 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005692 Py_ssize_t startinpos;
5693 Py_ssize_t endinpos;
5694 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005695 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005696 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005697 const char *end;
5698 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005699 PyObject *errorHandler = NULL;
5700 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005701
Guido van Rossumd57fd912000-03-10 22:53:23 +00005702 /* Escaped strings will always be longer than the resulting
5703 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005704 length after conversion to the true value. (But decoding error
5705 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005706 v = _PyUnicode_New(size);
5707 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005708 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005709 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005710 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005711 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005712 end = s + size;
5713 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005714 unsigned char c;
5715 Py_UCS4 x;
5716 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005717 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005718
Benjamin Peterson29060642009-01-31 22:14:21 +00005719 /* Non-escape characters are interpreted as Unicode ordinals */
5720 if (*s != '\\') {
5721 *p++ = (unsigned char)*s++;
5722 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005723 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005724 startinpos = s-starts;
5725
5726 /* \u-escapes are only interpreted iff the number of leading
5727 backslashes if odd */
5728 bs = s;
5729 for (;s < end;) {
5730 if (*s != '\\')
5731 break;
5732 *p++ = (unsigned char)*s++;
5733 }
5734 if (((s - bs) & 1) == 0 ||
5735 s >= end ||
5736 (*s != 'u' && *s != 'U')) {
5737 continue;
5738 }
5739 p--;
5740 count = *s=='u' ? 4 : 8;
5741 s++;
5742
5743 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
5744 outpos = p-PyUnicode_AS_UNICODE(v);
5745 for (x = 0, i = 0; i < count; ++i, ++s) {
5746 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005747 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005748 endinpos = s-starts;
5749 if (unicode_decode_call_errorhandler(
5750 errors, &errorHandler,
5751 "rawunicodeescape", "truncated \\uXXXX",
5752 &starts, &end, &startinpos, &endinpos, &exc, &s,
5753 &v, &outpos, &p))
5754 goto onError;
5755 goto nextByte;
5756 }
5757 x = (x<<4) & ~0xF;
5758 if (c >= '0' && c <= '9')
5759 x += c - '0';
5760 else if (c >= 'a' && c <= 'f')
5761 x += 10 + c - 'a';
5762 else
5763 x += 10 + c - 'A';
5764 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00005765 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00005766 /* UCS-2 character */
5767 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005768 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005769 /* UCS-4 character. Either store directly, or as
5770 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00005771#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005772 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005773#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005774 x -= 0x10000L;
5775 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
5776 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00005777#endif
5778 } else {
5779 endinpos = s-starts;
5780 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005781 if (unicode_decode_call_errorhandler(
5782 errors, &errorHandler,
5783 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005784 &starts, &end, &startinpos, &endinpos, &exc, &s,
5785 &v, &outpos, &p))
5786 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005787 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005788 nextByte:
5789 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005790 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02005791 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005792 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005793 Py_XDECREF(errorHandler);
5794 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005795 if (PyUnicode_READY(v) == -1) {
5796 Py_DECREF(v);
5797 return NULL;
5798 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005799 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005800
Benjamin Peterson29060642009-01-31 22:14:21 +00005801 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005802 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005803 Py_XDECREF(errorHandler);
5804 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005805 return NULL;
5806}
5807
Alexander Belopolsky40018472011-02-26 01:02:56 +00005808PyObject *
5809PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005810 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005811{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005812 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005813 char *p;
5814 char *q;
5815
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005816#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005817 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005818#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005819 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005820#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00005821
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005822 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005823 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005824
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005825 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005826 if (repr == NULL)
5827 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005828 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005829 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005830
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005831 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005832 while (size-- > 0) {
5833 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005834#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005835 /* Map 32-bit characters to '\Uxxxxxxxx' */
5836 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005837 *p++ = '\\';
5838 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005839 *p++ = hexdigits[(ch >> 28) & 0xf];
5840 *p++ = hexdigits[(ch >> 24) & 0xf];
5841 *p++ = hexdigits[(ch >> 20) & 0xf];
5842 *p++ = hexdigits[(ch >> 16) & 0xf];
5843 *p++ = hexdigits[(ch >> 12) & 0xf];
5844 *p++ = hexdigits[(ch >> 8) & 0xf];
5845 *p++ = hexdigits[(ch >> 4) & 0xf];
5846 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00005847 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005848 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00005849#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005850 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5851 if (ch >= 0xD800 && ch < 0xDC00) {
5852 Py_UNICODE ch2;
5853 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005854
Benjamin Peterson29060642009-01-31 22:14:21 +00005855 ch2 = *s++;
5856 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005857 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005858 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5859 *p++ = '\\';
5860 *p++ = 'U';
5861 *p++ = hexdigits[(ucs >> 28) & 0xf];
5862 *p++ = hexdigits[(ucs >> 24) & 0xf];
5863 *p++ = hexdigits[(ucs >> 20) & 0xf];
5864 *p++ = hexdigits[(ucs >> 16) & 0xf];
5865 *p++ = hexdigits[(ucs >> 12) & 0xf];
5866 *p++ = hexdigits[(ucs >> 8) & 0xf];
5867 *p++ = hexdigits[(ucs >> 4) & 0xf];
5868 *p++ = hexdigits[ucs & 0xf];
5869 continue;
5870 }
5871 /* Fall through: isolated surrogates are copied as-is */
5872 s--;
5873 size++;
5874 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005875#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005876 /* Map 16-bit characters to '\uxxxx' */
5877 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005878 *p++ = '\\';
5879 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005880 *p++ = hexdigits[(ch >> 12) & 0xf];
5881 *p++ = hexdigits[(ch >> 8) & 0xf];
5882 *p++ = hexdigits[(ch >> 4) & 0xf];
5883 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005884 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005885 /* Copy everything else as-is */
5886 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00005887 *p++ = (char) ch;
5888 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005889 size = p - q;
5890
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005891 assert(size > 0);
5892 if (_PyBytes_Resize(&repr, size) < 0)
5893 return NULL;
5894 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005895}
5896
Alexander Belopolsky40018472011-02-26 01:02:56 +00005897PyObject *
5898PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005899{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005900 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005901 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00005902 PyErr_BadArgument();
5903 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005904 }
Walter Dörwald711005d2007-05-12 12:03:26 +00005905 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5906 PyUnicode_GET_SIZE(unicode));
5907
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005908 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005909}
5910
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005911/* --- Unicode Internal Codec ------------------------------------------- */
5912
Alexander Belopolsky40018472011-02-26 01:02:56 +00005913PyObject *
5914_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005915 Py_ssize_t size,
5916 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005917{
5918 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005919 Py_ssize_t startinpos;
5920 Py_ssize_t endinpos;
5921 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005922 PyUnicodeObject *v;
5923 Py_UNICODE *p;
5924 const char *end;
5925 const char *reason;
5926 PyObject *errorHandler = NULL;
5927 PyObject *exc = NULL;
5928
Neal Norwitzd43069c2006-01-08 01:12:10 +00005929#ifdef Py_UNICODE_WIDE
5930 Py_UNICODE unimax = PyUnicode_GetMax();
5931#endif
5932
Thomas Wouters89f507f2006-12-13 04:49:30 +00005933 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005934 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
5935 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005936 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005937 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
5938 as string was created with the old API. */
5939 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005940 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005941 p = PyUnicode_AS_UNICODE(v);
5942 end = s + size;
5943
5944 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005945 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005946 /* We have to sanity check the raw data, otherwise doom looms for
5947 some malformed UCS-4 data. */
5948 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00005949#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005950 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00005951#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005952 end-s < Py_UNICODE_SIZE
5953 )
Benjamin Peterson29060642009-01-31 22:14:21 +00005954 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005955 startinpos = s - starts;
5956 if (end-s < Py_UNICODE_SIZE) {
5957 endinpos = end-starts;
5958 reason = "truncated input";
5959 }
5960 else {
5961 endinpos = s - starts + Py_UNICODE_SIZE;
5962 reason = "illegal code point (> 0x10FFFF)";
5963 }
5964 outpos = p - PyUnicode_AS_UNICODE(v);
5965 if (unicode_decode_call_errorhandler(
5966 errors, &errorHandler,
5967 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00005968 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00005969 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005970 goto onError;
5971 }
5972 }
5973 else {
5974 p++;
5975 s += Py_UNICODE_SIZE;
5976 }
5977 }
5978
Victor Stinnerfe226c02011-10-03 03:52:20 +02005979 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005980 goto onError;
5981 Py_XDECREF(errorHandler);
5982 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005983 if (PyUnicode_READY(v) == -1) {
5984 Py_DECREF(v);
5985 return NULL;
5986 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005987 return (PyObject *)v;
5988
Benjamin Peterson29060642009-01-31 22:14:21 +00005989 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005990 Py_XDECREF(v);
5991 Py_XDECREF(errorHandler);
5992 Py_XDECREF(exc);
5993 return NULL;
5994}
5995
Guido van Rossumd57fd912000-03-10 22:53:23 +00005996/* --- Latin-1 Codec ------------------------------------------------------ */
5997
Alexander Belopolsky40018472011-02-26 01:02:56 +00005998PyObject *
5999PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006000 Py_ssize_t size,
6001 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006002{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006003 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006004 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006005}
6006
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006007/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006008static void
6009make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006010 const char *encoding,
6011 const Py_UNICODE *unicode, Py_ssize_t size,
6012 Py_ssize_t startpos, Py_ssize_t endpos,
6013 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006014{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006015 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006016 *exceptionObject = PyUnicodeEncodeError_Create(
6017 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006018 }
6019 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006020 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6021 goto onError;
6022 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6023 goto onError;
6024 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6025 goto onError;
6026 return;
6027 onError:
6028 Py_DECREF(*exceptionObject);
6029 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006030 }
6031}
6032
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006033/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006034static void
6035raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006036 const char *encoding,
6037 const Py_UNICODE *unicode, Py_ssize_t size,
6038 Py_ssize_t startpos, Py_ssize_t endpos,
6039 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006040{
6041 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006042 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006043 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006044 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006045}
6046
6047/* error handling callback helper:
6048 build arguments, call the callback and check the arguments,
6049 put the result into newpos and return the replacement string, which
6050 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006051static PyObject *
6052unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006053 PyObject **errorHandler,
6054 const char *encoding, const char *reason,
6055 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
6056 Py_ssize_t startpos, Py_ssize_t endpos,
6057 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006058{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006059 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006060
6061 PyObject *restuple;
6062 PyObject *resunicode;
6063
6064 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006065 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006066 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006067 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006068 }
6069
6070 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006071 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006072 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006073 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006074
6075 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006076 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006077 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006078 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006079 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006080 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006081 Py_DECREF(restuple);
6082 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006083 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006084 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006085 &resunicode, newpos)) {
6086 Py_DECREF(restuple);
6087 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006088 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006089 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6090 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6091 Py_DECREF(restuple);
6092 return NULL;
6093 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006094 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006095 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006096 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006097 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6098 Py_DECREF(restuple);
6099 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006100 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006101 Py_INCREF(resunicode);
6102 Py_DECREF(restuple);
6103 return resunicode;
6104}
6105
Alexander Belopolsky40018472011-02-26 01:02:56 +00006106static PyObject *
6107unicode_encode_ucs1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006108 Py_ssize_t size,
6109 const char *errors,
6110 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006111{
6112 /* output object */
6113 PyObject *res;
6114 /* pointers to the beginning and end+1 of input */
6115 const Py_UNICODE *startp = p;
6116 const Py_UNICODE *endp = p + size;
6117 /* pointer to the beginning of the unencodable characters */
6118 /* const Py_UNICODE *badp = NULL; */
6119 /* pointer into the output */
6120 char *str;
6121 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006122 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006123 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6124 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006125 PyObject *errorHandler = NULL;
6126 PyObject *exc = NULL;
6127 /* the following variable is used for caching string comparisons
6128 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6129 int known_errorHandler = -1;
6130
6131 /* allocate enough for a simple encoding without
6132 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006133 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006134 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006135 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006136 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006137 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006138 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006139 ressize = size;
6140
6141 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006142 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006143
Benjamin Peterson29060642009-01-31 22:14:21 +00006144 /* can we encode this? */
6145 if (c<limit) {
6146 /* no overflow check, because we know that the space is enough */
6147 *str++ = (char)c;
6148 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006149 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006150 else {
6151 Py_ssize_t unicodepos = p-startp;
6152 Py_ssize_t requiredsize;
6153 PyObject *repunicode;
6154 Py_ssize_t repsize;
6155 Py_ssize_t newpos;
6156 Py_ssize_t respos;
6157 Py_UNICODE *uni2;
6158 /* startpos for collecting unencodable chars */
6159 const Py_UNICODE *collstart = p;
6160 const Py_UNICODE *collend = p;
6161 /* find all unecodable characters */
6162 while ((collend < endp) && ((*collend)>=limit))
6163 ++collend;
6164 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6165 if (known_errorHandler==-1) {
6166 if ((errors==NULL) || (!strcmp(errors, "strict")))
6167 known_errorHandler = 1;
6168 else if (!strcmp(errors, "replace"))
6169 known_errorHandler = 2;
6170 else if (!strcmp(errors, "ignore"))
6171 known_errorHandler = 3;
6172 else if (!strcmp(errors, "xmlcharrefreplace"))
6173 known_errorHandler = 4;
6174 else
6175 known_errorHandler = 0;
6176 }
6177 switch (known_errorHandler) {
6178 case 1: /* strict */
6179 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
6180 goto onError;
6181 case 2: /* replace */
6182 while (collstart++<collend)
6183 *str++ = '?'; /* fall through */
6184 case 3: /* ignore */
6185 p = collend;
6186 break;
6187 case 4: /* xmlcharrefreplace */
6188 respos = str - PyBytes_AS_STRING(res);
6189 /* determine replacement size (temporarily (mis)uses p) */
6190 for (p = collstart, repsize = 0; p < collend; ++p) {
6191 if (*p<10)
6192 repsize += 2+1+1;
6193 else if (*p<100)
6194 repsize += 2+2+1;
6195 else if (*p<1000)
6196 repsize += 2+3+1;
6197 else if (*p<10000)
6198 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006199#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006200 else
6201 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006202#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006203 else if (*p<100000)
6204 repsize += 2+5+1;
6205 else if (*p<1000000)
6206 repsize += 2+6+1;
6207 else
6208 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00006209#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006210 }
6211 requiredsize = respos+repsize+(endp-collend);
6212 if (requiredsize > ressize) {
6213 if (requiredsize<2*ressize)
6214 requiredsize = 2*ressize;
6215 if (_PyBytes_Resize(&res, requiredsize))
6216 goto onError;
6217 str = PyBytes_AS_STRING(res) + respos;
6218 ressize = requiredsize;
6219 }
6220 /* generate replacement (temporarily (mis)uses p) */
6221 for (p = collstart; p < collend; ++p) {
6222 str += sprintf(str, "&#%d;", (int)*p);
6223 }
6224 p = collend;
6225 break;
6226 default:
6227 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6228 encoding, reason, startp, size, &exc,
6229 collstart-startp, collend-startp, &newpos);
6230 if (repunicode == NULL)
6231 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006232 if (PyBytes_Check(repunicode)) {
6233 /* Directly copy bytes result to output. */
6234 repsize = PyBytes_Size(repunicode);
6235 if (repsize > 1) {
6236 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006237 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006238 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6239 Py_DECREF(repunicode);
6240 goto onError;
6241 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006242 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006243 ressize += repsize-1;
6244 }
6245 memcpy(str, PyBytes_AsString(repunicode), repsize);
6246 str += repsize;
6247 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006248 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006249 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006250 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006251 /* need more space? (at least enough for what we
6252 have+the replacement+the rest of the string, so
6253 we won't have to check space for encodable characters) */
6254 respos = str - PyBytes_AS_STRING(res);
6255 repsize = PyUnicode_GET_SIZE(repunicode);
6256 requiredsize = respos+repsize+(endp-collend);
6257 if (requiredsize > ressize) {
6258 if (requiredsize<2*ressize)
6259 requiredsize = 2*ressize;
6260 if (_PyBytes_Resize(&res, requiredsize)) {
6261 Py_DECREF(repunicode);
6262 goto onError;
6263 }
6264 str = PyBytes_AS_STRING(res) + respos;
6265 ressize = requiredsize;
6266 }
6267 /* check if there is anything unencodable in the replacement
6268 and copy it to the output */
6269 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
6270 c = *uni2;
6271 if (c >= limit) {
6272 raise_encode_exception(&exc, encoding, startp, size,
6273 unicodepos, unicodepos+1, reason);
6274 Py_DECREF(repunicode);
6275 goto onError;
6276 }
6277 *str = (char)c;
6278 }
6279 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006280 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006281 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006282 }
6283 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006284 /* Resize if we allocated to much */
6285 size = str - PyBytes_AS_STRING(res);
6286 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006287 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006288 if (_PyBytes_Resize(&res, size) < 0)
6289 goto onError;
6290 }
6291
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006292 Py_XDECREF(errorHandler);
6293 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006294 return res;
6295
6296 onError:
6297 Py_XDECREF(res);
6298 Py_XDECREF(errorHandler);
6299 Py_XDECREF(exc);
6300 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006301}
6302
Alexander Belopolsky40018472011-02-26 01:02:56 +00006303PyObject *
6304PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006305 Py_ssize_t size,
6306 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006307{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006308 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006309}
6310
Alexander Belopolsky40018472011-02-26 01:02:56 +00006311PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006312_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006313{
6314 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006315 PyErr_BadArgument();
6316 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006317 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006318 if (PyUnicode_READY(unicode) == -1)
6319 return NULL;
6320 /* Fast path: if it is a one-byte string, construct
6321 bytes object directly. */
6322 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6323 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6324 PyUnicode_GET_LENGTH(unicode));
6325 /* Non-Latin-1 characters present. Defer to above function to
6326 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006327 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006328 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006329 errors);
6330}
6331
6332PyObject*
6333PyUnicode_AsLatin1String(PyObject *unicode)
6334{
6335 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006336}
6337
6338/* --- 7-bit ASCII Codec -------------------------------------------------- */
6339
Alexander Belopolsky40018472011-02-26 01:02:56 +00006340PyObject *
6341PyUnicode_DecodeASCII(const char *s,
6342 Py_ssize_t size,
6343 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006344{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006345 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006346 PyUnicodeObject *v;
6347 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006348 Py_ssize_t startinpos;
6349 Py_ssize_t endinpos;
6350 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006351 const char *e;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006352 unsigned char* d;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006353 PyObject *errorHandler = NULL;
6354 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006355 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00006356
Guido van Rossumd57fd912000-03-10 22:53:23 +00006357 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006358 if (size == 1 && *(unsigned char*)s < 128)
6359 return PyUnicode_FromOrdinal(*(unsigned char*)s);
6360
6361 /* Fast path. Assume the input actually *is* ASCII, and allocate
6362 a single-block Unicode object with that assumption. If there is
6363 an error, drop the object and start over. */
6364 v = (PyUnicodeObject*)PyUnicode_New(size, 127);
6365 if (v == NULL)
6366 goto onError;
6367 d = PyUnicode_1BYTE_DATA(v);
6368 for (i = 0; i < size; i++) {
6369 unsigned char ch = ((unsigned char*)s)[i];
6370 if (ch < 128)
6371 d[i] = ch;
6372 else
6373 break;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006374 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006375 if (i == size)
6376 return (PyObject*)v;
6377 Py_DECREF(v); /* start over */
Tim Petersced69f82003-09-16 20:30:58 +00006378
Guido van Rossumd57fd912000-03-10 22:53:23 +00006379 v = _PyUnicode_New(size);
6380 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006381 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006382 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006383 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006384 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006385 e = s + size;
6386 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006387 register unsigned char c = (unsigned char)*s;
6388 if (c < 128) {
6389 *p++ = c;
6390 ++s;
6391 }
6392 else {
6393 startinpos = s-starts;
6394 endinpos = startinpos + 1;
6395 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
6396 if (unicode_decode_call_errorhandler(
6397 errors, &errorHandler,
6398 "ascii", "ordinal not in range(128)",
6399 &starts, &e, &startinpos, &endinpos, &exc, &s,
6400 &v, &outpos, &p))
6401 goto onError;
6402 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006403 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00006404 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinnerfe226c02011-10-03 03:52:20 +02006405 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006406 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006407 Py_XDECREF(errorHandler);
6408 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006409 if (PyUnicode_READY(v) == -1) {
6410 Py_DECREF(v);
6411 return NULL;
6412 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006413 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006414
Benjamin Peterson29060642009-01-31 22:14:21 +00006415 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006416 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006417 Py_XDECREF(errorHandler);
6418 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006419 return NULL;
6420}
6421
Alexander Belopolsky40018472011-02-26 01:02:56 +00006422PyObject *
6423PyUnicode_EncodeASCII(const Py_UNICODE *p,
6424 Py_ssize_t size,
6425 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006426{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006427 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006428}
6429
Alexander Belopolsky40018472011-02-26 01:02:56 +00006430PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006431_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006432{
6433 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006434 PyErr_BadArgument();
6435 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006436 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006437 if (PyUnicode_READY(unicode) == -1)
6438 return NULL;
6439 /* Fast path: if it is an ASCII-only string, construct bytes object
6440 directly. Else defer to above function to raise the exception. */
6441 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6442 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6443 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006444 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006445 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006446 errors);
6447}
6448
6449PyObject *
6450PyUnicode_AsASCIIString(PyObject *unicode)
6451{
6452 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006453}
6454
Victor Stinner99b95382011-07-04 14:23:54 +02006455#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006456
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006457/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006458
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006459#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006460#define NEED_RETRY
6461#endif
6462
6463/* XXX This code is limited to "true" double-byte encodings, as
6464 a) it assumes an incomplete character consists of a single byte, and
6465 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00006466 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006467
/* Check the byte at s[offset] for being a DBCS lead byte.

   Returns 0 when IsDBCSLeadByte() rejects the byte.  Otherwise returns
   nonzero if CharPrev() could not step back (it returned the same
   position), if the character before it does not start with a lead byte,
   or if that previous character is two bytes long. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
6479
6480/*
6481 * Decode MBCS string into unicode object. If 'final' is set, converts
6482 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6483 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006484static int
6485decode_mbcs(PyUnicodeObject **v,
6486 const char *s, /* MBCS string */
6487 int size, /* sizeof MBCS string */
6488 int final,
6489 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006490{
6491 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00006492 Py_ssize_t n;
6493 DWORD usize;
6494 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006495
6496 assert(size >= 0);
6497
Victor Stinner554f3f02010-06-16 23:33:54 +00006498 /* check and handle 'errors' arg */
6499 if (errors==NULL || strcmp(errors, "strict")==0)
6500 flags = MB_ERR_INVALID_CHARS;
6501 else if (strcmp(errors, "ignore")==0)
6502 flags = 0;
6503 else {
6504 PyErr_Format(PyExc_ValueError,
6505 "mbcs encoding does not support errors='%s'",
6506 errors);
6507 return -1;
6508 }
6509
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006510 /* Skip trailing lead-byte unless 'final' is set */
6511 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006512 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006513
6514 /* First get the size of the result */
6515 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006516 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6517 if (usize==0)
6518 goto mbcs_decode_error;
6519 } else
6520 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006521
6522 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006523 /* Create unicode object */
6524 *v = _PyUnicode_New(usize);
6525 if (*v == NULL)
6526 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006527 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006528 }
6529 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006530 /* Extend unicode object */
6531 n = PyUnicode_GET_SIZE(*v);
Victor Stinner2fd82272011-10-03 04:06:05 +02006532 if (PyUnicode_Resize((PyObject**)v, n + usize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006533 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006534 }
6535
6536 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00006537 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006538 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006539 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6540 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00006541 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006542 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006543 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00006544
6545mbcs_decode_error:
6546 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6547 we raise a UnicodeDecodeError - else it is a 'generic'
6548 windows error
6549 */
6550 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6551 /* Ideally, we should get reason from FormatMessage - this
6552 is the Windows 2000 English version of the message
6553 */
6554 PyObject *exc = NULL;
6555 const char *reason = "No mapping for the Unicode character exists "
6556 "in the target multi-byte code page.";
6557 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6558 if (exc != NULL) {
6559 PyCodec_StrictErrors(exc);
6560 Py_DECREF(exc);
6561 }
6562 } else {
6563 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6564 }
6565 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006566}
6567
Alexander Belopolsky40018472011-02-26 01:02:56 +00006568PyObject *
6569PyUnicode_DecodeMBCSStateful(const char *s,
6570 Py_ssize_t size,
6571 const char *errors,
6572 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006573{
6574 PyUnicodeObject *v = NULL;
6575 int done;
6576
6577 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006578 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006579
6580#ifdef NEED_RETRY
6581 retry:
6582 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006583 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006584 else
6585#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006586 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006587
6588 if (done < 0) {
6589 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006590 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006591 }
6592
6593 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006594 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006595
6596#ifdef NEED_RETRY
6597 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006598 s += done;
6599 size -= done;
6600 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006601 }
6602#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006603 if (PyUnicode_READY(v) == -1) {
6604 Py_DECREF(v);
6605 return NULL;
6606 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006607 return (PyObject *)v;
6608}
6609
Alexander Belopolsky40018472011-02-26 01:02:56 +00006610PyObject *
6611PyUnicode_DecodeMBCS(const char *s,
6612 Py_ssize_t size,
6613 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006614{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006615 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6616}
6617
6618/*
6619 * Convert unicode into string object (MBCS).
6620 * Returns 0 if succeed, -1 otherwise.
6621 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006622static int
6623encode_mbcs(PyObject **repr,
6624 const Py_UNICODE *p, /* unicode */
6625 int size, /* size of unicode */
6626 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006627{
Victor Stinner554f3f02010-06-16 23:33:54 +00006628 BOOL usedDefaultChar = FALSE;
6629 BOOL *pusedDefaultChar;
6630 int mbcssize;
6631 Py_ssize_t n;
6632 PyObject *exc = NULL;
6633 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006634
6635 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006636
Victor Stinner554f3f02010-06-16 23:33:54 +00006637 /* check and handle 'errors' arg */
6638 if (errors==NULL || strcmp(errors, "strict")==0) {
6639 flags = WC_NO_BEST_FIT_CHARS;
6640 pusedDefaultChar = &usedDefaultChar;
6641 } else if (strcmp(errors, "replace")==0) {
6642 flags = 0;
6643 pusedDefaultChar = NULL;
6644 } else {
6645 PyErr_Format(PyExc_ValueError,
6646 "mbcs encoding does not support errors='%s'",
6647 errors);
6648 return -1;
6649 }
6650
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006651 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006652 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006653 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
6654 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00006655 if (mbcssize == 0) {
6656 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6657 return -1;
6658 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006659 /* If we used a default char, then we failed! */
6660 if (pusedDefaultChar && *pusedDefaultChar)
6661 goto mbcs_encode_error;
6662 } else {
6663 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006664 }
6665
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006666 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006667 /* Create string object */
6668 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
6669 if (*repr == NULL)
6670 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006671 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006672 }
6673 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006674 /* Extend string object */
6675 n = PyBytes_Size(*repr);
6676 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
6677 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006678 }
6679
6680 /* Do the conversion */
6681 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006682 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006683 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
6684 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006685 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6686 return -1;
6687 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006688 if (pusedDefaultChar && *pusedDefaultChar)
6689 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006690 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006691 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006692
6693mbcs_encode_error:
6694 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
6695 Py_XDECREF(exc);
6696 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006697}
6698
Alexander Belopolsky40018472011-02-26 01:02:56 +00006699PyObject *
6700PyUnicode_EncodeMBCS(const Py_UNICODE *p,
6701 Py_ssize_t size,
6702 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006703{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006704 PyObject *repr = NULL;
6705 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00006706
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006707#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00006708 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006709 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006710 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006711 else
6712#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006713 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006714
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006715 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006716 Py_XDECREF(repr);
6717 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006718 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006719
6720#ifdef NEED_RETRY
6721 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006722 p += INT_MAX;
6723 size -= INT_MAX;
6724 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006725 }
6726#endif
6727
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006728 return repr;
6729}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006730
Alexander Belopolsky40018472011-02-26 01:02:56 +00006731PyObject *
6732PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006733{
6734 if (!PyUnicode_Check(unicode)) {
6735 PyErr_BadArgument();
6736 return NULL;
6737 }
6738 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006739 PyUnicode_GET_SIZE(unicode),
6740 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006741}
6742
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006743#undef NEED_RETRY
6744
Victor Stinner99b95382011-07-04 14:23:54 +02006745#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006746
Guido van Rossumd57fd912000-03-10 22:53:23 +00006747/* --- Character Mapping Codec -------------------------------------------- */
6748
Alexander Belopolsky40018472011-02-26 01:02:56 +00006749PyObject *
6750PyUnicode_DecodeCharmap(const char *s,
6751 Py_ssize_t size,
6752 PyObject *mapping,
6753 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006754{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006755 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006756 Py_ssize_t startinpos;
6757 Py_ssize_t endinpos;
6758 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006759 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006760 PyUnicodeObject *v;
6761 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006762 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006763 PyObject *errorHandler = NULL;
6764 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006765 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006766 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006767
Guido van Rossumd57fd912000-03-10 22:53:23 +00006768 /* Default to Latin-1 */
6769 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006770 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006771
6772 v = _PyUnicode_New(size);
6773 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006774 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006775 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006776 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006777 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006778 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006779 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006780 mapstring = PyUnicode_AS_UNICODE(mapping);
6781 maplen = PyUnicode_GET_SIZE(mapping);
6782 while (s < e) {
6783 unsigned char ch = *s;
6784 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006785
Benjamin Peterson29060642009-01-31 22:14:21 +00006786 if (ch < maplen)
6787 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006788
Benjamin Peterson29060642009-01-31 22:14:21 +00006789 if (x == 0xfffe) {
6790 /* undefined mapping */
6791 outpos = p-PyUnicode_AS_UNICODE(v);
6792 startinpos = s-starts;
6793 endinpos = startinpos+1;
6794 if (unicode_decode_call_errorhandler(
6795 errors, &errorHandler,
6796 "charmap", "character maps to <undefined>",
6797 &starts, &e, &startinpos, &endinpos, &exc, &s,
6798 &v, &outpos, &p)) {
6799 goto onError;
6800 }
6801 continue;
6802 }
6803 *p++ = x;
6804 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006805 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006806 }
6807 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006808 while (s < e) {
6809 unsigned char ch = *s;
6810 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006811
Benjamin Peterson29060642009-01-31 22:14:21 +00006812 /* Get mapping (char ordinal -> integer, Unicode char or None) */
6813 w = PyLong_FromLong((long)ch);
6814 if (w == NULL)
6815 goto onError;
6816 x = PyObject_GetItem(mapping, w);
6817 Py_DECREF(w);
6818 if (x == NULL) {
6819 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6820 /* No mapping found means: mapping is undefined. */
6821 PyErr_Clear();
6822 x = Py_None;
6823 Py_INCREF(x);
6824 } else
6825 goto onError;
6826 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006827
Benjamin Peterson29060642009-01-31 22:14:21 +00006828 /* Apply mapping */
6829 if (PyLong_Check(x)) {
6830 long value = PyLong_AS_LONG(x);
6831 if (value < 0 || value > 65535) {
6832 PyErr_SetString(PyExc_TypeError,
6833 "character mapping must be in range(65536)");
6834 Py_DECREF(x);
6835 goto onError;
6836 }
6837 *p++ = (Py_UNICODE)value;
6838 }
6839 else if (x == Py_None) {
6840 /* undefined mapping */
6841 outpos = p-PyUnicode_AS_UNICODE(v);
6842 startinpos = s-starts;
6843 endinpos = startinpos+1;
6844 if (unicode_decode_call_errorhandler(
6845 errors, &errorHandler,
6846 "charmap", "character maps to <undefined>",
6847 &starts, &e, &startinpos, &endinpos, &exc, &s,
6848 &v, &outpos, &p)) {
6849 Py_DECREF(x);
6850 goto onError;
6851 }
6852 Py_DECREF(x);
6853 continue;
6854 }
6855 else if (PyUnicode_Check(x)) {
6856 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006857
Benjamin Peterson29060642009-01-31 22:14:21 +00006858 if (targetsize == 1)
6859 /* 1-1 mapping */
6860 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006861
Benjamin Peterson29060642009-01-31 22:14:21 +00006862 else if (targetsize > 1) {
6863 /* 1-n mapping */
6864 if (targetsize > extrachars) {
6865 /* resize first */
6866 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
6867 Py_ssize_t needed = (targetsize - extrachars) + \
6868 (targetsize << 2);
6869 extrachars += needed;
6870 /* XXX overflow detection missing */
Victor Stinnerfe226c02011-10-03 03:52:20 +02006871 if (PyUnicode_Resize((PyObject**)&v,
Benjamin Peterson29060642009-01-31 22:14:21 +00006872 PyUnicode_GET_SIZE(v) + needed) < 0) {
6873 Py_DECREF(x);
6874 goto onError;
6875 }
6876 p = PyUnicode_AS_UNICODE(v) + oldpos;
6877 }
6878 Py_UNICODE_COPY(p,
6879 PyUnicode_AS_UNICODE(x),
6880 targetsize);
6881 p += targetsize;
6882 extrachars -= targetsize;
6883 }
6884 /* 1-0 mapping: skip the character */
6885 }
6886 else {
6887 /* wrong return value */
6888 PyErr_SetString(PyExc_TypeError,
6889 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006890 Py_DECREF(x);
6891 goto onError;
6892 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006893 Py_DECREF(x);
6894 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006895 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006896 }
6897 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinnerfe226c02011-10-03 03:52:20 +02006898 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006899 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006900 Py_XDECREF(errorHandler);
6901 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006902 if (PyUnicode_READY(v) == -1) {
6903 Py_DECREF(v);
6904 return NULL;
6905 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006906 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006907
Benjamin Peterson29060642009-01-31 22:14:21 +00006908 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006909 Py_XDECREF(errorHandler);
6910 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006911 Py_XDECREF(v);
6912 return NULL;
6913}
6914
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006915/* Charmap encoding: the lookup table */
6916
Alexander Belopolsky40018472011-02-26 01:02:56 +00006917struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00006918 PyObject_HEAD
6919 unsigned char level1[32];
6920 int count2, count3;
6921 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006922};
6923
6924static PyObject*
6925encoding_map_size(PyObject *obj, PyObject* args)
6926{
6927 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006928 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00006929 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006930}
6931
6932static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006933 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00006934 PyDoc_STR("Return the size (in bytes) of this object") },
6935 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006936};
6937
6938static void
6939encoding_map_dealloc(PyObject* o)
6940{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006941 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006942}
6943
6944static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006945 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006946 "EncodingMap", /*tp_name*/
6947 sizeof(struct encoding_map), /*tp_basicsize*/
6948 0, /*tp_itemsize*/
6949 /* methods */
6950 encoding_map_dealloc, /*tp_dealloc*/
6951 0, /*tp_print*/
6952 0, /*tp_getattr*/
6953 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00006954 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00006955 0, /*tp_repr*/
6956 0, /*tp_as_number*/
6957 0, /*tp_as_sequence*/
6958 0, /*tp_as_mapping*/
6959 0, /*tp_hash*/
6960 0, /*tp_call*/
6961 0, /*tp_str*/
6962 0, /*tp_getattro*/
6963 0, /*tp_setattro*/
6964 0, /*tp_as_buffer*/
6965 Py_TPFLAGS_DEFAULT, /*tp_flags*/
6966 0, /*tp_doc*/
6967 0, /*tp_traverse*/
6968 0, /*tp_clear*/
6969 0, /*tp_richcompare*/
6970 0, /*tp_weaklistoffset*/
6971 0, /*tp_iter*/
6972 0, /*tp_iternext*/
6973 encoding_map_methods, /*tp_methods*/
6974 0, /*tp_members*/
6975 0, /*tp_getset*/
6976 0, /*tp_base*/
6977 0, /*tp_dict*/
6978 0, /*tp_descr_get*/
6979 0, /*tp_descr_set*/
6980 0, /*tp_dictoffset*/
6981 0, /*tp_init*/
6982 0, /*tp_alloc*/
6983 0, /*tp_new*/
6984 0, /*tp_free*/
6985 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006986};
6987
6988PyObject*
6989PyUnicode_BuildEncodingMap(PyObject* string)
6990{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006991 PyObject *result;
6992 struct encoding_map *mresult;
6993 int i;
6994 int need_dict = 0;
6995 unsigned char level1[32];
6996 unsigned char level2[512];
6997 unsigned char *mlevel1, *mlevel2, *mlevel3;
6998 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006999 int kind;
7000 void *data;
7001 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007002
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007003 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007004 PyErr_BadArgument();
7005 return NULL;
7006 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007007 kind = PyUnicode_KIND(string);
7008 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007009 memset(level1, 0xFF, sizeof level1);
7010 memset(level2, 0xFF, sizeof level2);
7011
7012 /* If there isn't a one-to-one mapping of NULL to \0,
7013 or if there are non-BMP characters, we need to use
7014 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007015 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007016 need_dict = 1;
7017 for (i = 1; i < 256; i++) {
7018 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007019 ch = PyUnicode_READ(kind, data, i);
7020 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007021 need_dict = 1;
7022 break;
7023 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007024 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007025 /* unmapped character */
7026 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007027 l1 = ch >> 11;
7028 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007029 if (level1[l1] == 0xFF)
7030 level1[l1] = count2++;
7031 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007032 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007033 }
7034
7035 if (count2 >= 0xFF || count3 >= 0xFF)
7036 need_dict = 1;
7037
7038 if (need_dict) {
7039 PyObject *result = PyDict_New();
7040 PyObject *key, *value;
7041 if (!result)
7042 return NULL;
7043 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007044 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007045 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007046 if (!key || !value)
7047 goto failed1;
7048 if (PyDict_SetItem(result, key, value) == -1)
7049 goto failed1;
7050 Py_DECREF(key);
7051 Py_DECREF(value);
7052 }
7053 return result;
7054 failed1:
7055 Py_XDECREF(key);
7056 Py_XDECREF(value);
7057 Py_DECREF(result);
7058 return NULL;
7059 }
7060
7061 /* Create a three-level trie */
7062 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7063 16*count2 + 128*count3 - 1);
7064 if (!result)
7065 return PyErr_NoMemory();
7066 PyObject_Init(result, &EncodingMapType);
7067 mresult = (struct encoding_map*)result;
7068 mresult->count2 = count2;
7069 mresult->count3 = count3;
7070 mlevel1 = mresult->level1;
7071 mlevel2 = mresult->level23;
7072 mlevel3 = mresult->level23 + 16*count2;
7073 memcpy(mlevel1, level1, 32);
7074 memset(mlevel2, 0xFF, 16*count2);
7075 memset(mlevel3, 0, 128*count3);
7076 count3 = 0;
7077 for (i = 1; i < 256; i++) {
7078 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007079 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007080 /* unmapped character */
7081 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007082 o1 = PyUnicode_READ(kind, data, i)>>11;
7083 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007084 i2 = 16*mlevel1[o1] + o2;
7085 if (mlevel2[i2] == 0xFF)
7086 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007087 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007088 i3 = 128*mlevel2[i2] + o3;
7089 mlevel3[i3] = i;
7090 }
7091 return result;
7092}
7093
7094static int
7095encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
7096{
7097 struct encoding_map *map = (struct encoding_map*)mapping;
7098 int l1 = c>>11;
7099 int l2 = (c>>7) & 0xF;
7100 int l3 = c & 0x7F;
7101 int i;
7102
7103#ifdef Py_UNICODE_WIDE
7104 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007105 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007106 }
7107#endif
7108 if (c == 0)
7109 return 0;
7110 /* level 1*/
7111 i = map->level1[l1];
7112 if (i == 0xFF) {
7113 return -1;
7114 }
7115 /* level 2*/
7116 i = map->level23[16*i+l2];
7117 if (i == 0xFF) {
7118 return -1;
7119 }
7120 /* level 3 */
7121 i = map->level23[16*map->count2 + 128*i + l3];
7122 if (i == 0) {
7123 return -1;
7124 }
7125 return i;
7126}
7127
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007128/* Lookup the character ch in the mapping. If the character
7129 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007130 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007131static PyObject *
7132charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007133{
Christian Heimes217cfd12007-12-02 14:31:20 +00007134 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007135 PyObject *x;
7136
7137 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007138 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007139 x = PyObject_GetItem(mapping, w);
7140 Py_DECREF(w);
7141 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007142 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7143 /* No mapping found means: mapping is undefined. */
7144 PyErr_Clear();
7145 x = Py_None;
7146 Py_INCREF(x);
7147 return x;
7148 } else
7149 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007150 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007151 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007152 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007153 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007154 long value = PyLong_AS_LONG(x);
7155 if (value < 0 || value > 255) {
7156 PyErr_SetString(PyExc_TypeError,
7157 "character mapping must be in range(256)");
7158 Py_DECREF(x);
7159 return NULL;
7160 }
7161 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007162 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007163 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007164 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007165 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007166 /* wrong return value */
7167 PyErr_Format(PyExc_TypeError,
7168 "character mapping must return integer, bytes or None, not %.400s",
7169 x->ob_type->tp_name);
7170 Py_DECREF(x);
7171 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007172 }
7173}
7174
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007175static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007176charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007177{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007178 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7179 /* exponentially overallocate to minimize reallocations */
7180 if (requiredsize < 2*outsize)
7181 requiredsize = 2*outsize;
7182 if (_PyBytes_Resize(outobj, requiredsize))
7183 return -1;
7184 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007185}
7186
/* Outcome of trying to encode a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded */
    enc_FAILED,     /* the character has no mapping */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007190/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007191 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007192 space is available. Return a new reference to the object that
7193 was put in the output buffer, or Py_None, if the mapping was undefined
7194 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007195 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007196static charmapencode_result
7197charmapencode_output(Py_UNICODE c, PyObject *mapping,
7198 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007199{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007200 PyObject *rep;
7201 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007202 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007203
Christian Heimes90aa7642007-12-19 02:45:37 +00007204 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007205 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007206 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007207 if (res == -1)
7208 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007209 if (outsize<requiredsize)
7210 if (charmapencode_resize(outobj, outpos, requiredsize))
7211 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007212 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007213 outstart[(*outpos)++] = (char)res;
7214 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007215 }
7216
7217 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007218 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007219 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007220 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007221 Py_DECREF(rep);
7222 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007223 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007224 if (PyLong_Check(rep)) {
7225 Py_ssize_t requiredsize = *outpos+1;
7226 if (outsize<requiredsize)
7227 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7228 Py_DECREF(rep);
7229 return enc_EXCEPTION;
7230 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007231 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007232 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007233 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007234 else {
7235 const char *repchars = PyBytes_AS_STRING(rep);
7236 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7237 Py_ssize_t requiredsize = *outpos+repsize;
7238 if (outsize<requiredsize)
7239 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7240 Py_DECREF(rep);
7241 return enc_EXCEPTION;
7242 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007243 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007244 memcpy(outstart + *outpos, repchars, repsize);
7245 *outpos += repsize;
7246 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007247 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007248 Py_DECREF(rep);
7249 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007250}
7251
7252/* handle an error in PyUnicode_EncodeCharmap
7253 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007254static int
7255charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00007256 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007257 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007258 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007259 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007260{
7261 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007262 Py_ssize_t repsize;
7263 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007264 Py_UNICODE *uni2;
7265 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007266 Py_ssize_t collstartpos = *inpos;
7267 Py_ssize_t collendpos = *inpos+1;
7268 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007269 char *encoding = "charmap";
7270 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007271 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007272
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007273 /* find all unencodable characters */
7274 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007275 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007276 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007277 int res = encoding_map_lookup(p[collendpos], mapping);
7278 if (res != -1)
7279 break;
7280 ++collendpos;
7281 continue;
7282 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007283
Benjamin Peterson29060642009-01-31 22:14:21 +00007284 rep = charmapencode_lookup(p[collendpos], mapping);
7285 if (rep==NULL)
7286 return -1;
7287 else if (rep!=Py_None) {
7288 Py_DECREF(rep);
7289 break;
7290 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007291 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007292 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007293 }
7294 /* cache callback name lookup
7295 * (if not done yet, i.e. it's the first error) */
7296 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007297 if ((errors==NULL) || (!strcmp(errors, "strict")))
7298 *known_errorHandler = 1;
7299 else if (!strcmp(errors, "replace"))
7300 *known_errorHandler = 2;
7301 else if (!strcmp(errors, "ignore"))
7302 *known_errorHandler = 3;
7303 else if (!strcmp(errors, "xmlcharrefreplace"))
7304 *known_errorHandler = 4;
7305 else
7306 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007307 }
7308 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007309 case 1: /* strict */
7310 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7311 return -1;
7312 case 2: /* replace */
7313 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007314 x = charmapencode_output('?', mapping, res, respos);
7315 if (x==enc_EXCEPTION) {
7316 return -1;
7317 }
7318 else if (x==enc_FAILED) {
7319 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7320 return -1;
7321 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007322 }
7323 /* fall through */
7324 case 3: /* ignore */
7325 *inpos = collendpos;
7326 break;
7327 case 4: /* xmlcharrefreplace */
7328 /* generate replacement (temporarily (mis)uses p) */
7329 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007330 char buffer[2+29+1+1];
7331 char *cp;
7332 sprintf(buffer, "&#%d;", (int)p[collpos]);
7333 for (cp = buffer; *cp; ++cp) {
7334 x = charmapencode_output(*cp, mapping, res, respos);
7335 if (x==enc_EXCEPTION)
7336 return -1;
7337 else if (x==enc_FAILED) {
7338 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7339 return -1;
7340 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007341 }
7342 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007343 *inpos = collendpos;
7344 break;
7345 default:
7346 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00007347 encoding, reason, p, size, exceptionObject,
7348 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007349 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007350 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007351 if (PyBytes_Check(repunicode)) {
7352 /* Directly copy bytes result to output. */
7353 Py_ssize_t outsize = PyBytes_Size(*res);
7354 Py_ssize_t requiredsize;
7355 repsize = PyBytes_Size(repunicode);
7356 requiredsize = *respos + repsize;
7357 if (requiredsize > outsize)
7358 /* Make room for all additional bytes. */
7359 if (charmapencode_resize(res, respos, requiredsize)) {
7360 Py_DECREF(repunicode);
7361 return -1;
7362 }
7363 memcpy(PyBytes_AsString(*res) + *respos,
7364 PyBytes_AsString(repunicode), repsize);
7365 *respos += repsize;
7366 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007367 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007368 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007369 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007370 /* generate replacement */
7371 repsize = PyUnicode_GET_SIZE(repunicode);
7372 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007373 x = charmapencode_output(*uni2, mapping, res, respos);
7374 if (x==enc_EXCEPTION) {
7375 return -1;
7376 }
7377 else if (x==enc_FAILED) {
7378 Py_DECREF(repunicode);
7379 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7380 return -1;
7381 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007382 }
7383 *inpos = newpos;
7384 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007385 }
7386 return 0;
7387}
7388
Alexander Belopolsky40018472011-02-26 01:02:56 +00007389PyObject *
7390PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7391 Py_ssize_t size,
7392 PyObject *mapping,
7393 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007394{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007395 /* output object */
7396 PyObject *res = NULL;
7397 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007398 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007399 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007400 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007401 PyObject *errorHandler = NULL;
7402 PyObject *exc = NULL;
7403 /* the following variable is used for caching string comparisons
7404 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7405 * 3=ignore, 4=xmlcharrefreplace */
7406 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007407
7408 /* Default to Latin-1 */
7409 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007410 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007411
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007412 /* allocate enough for a simple encoding without
7413 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007414 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007415 if (res == NULL)
7416 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007417 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007418 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007419
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007420 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007421 /* try to encode it */
7422 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7423 if (x==enc_EXCEPTION) /* error */
7424 goto onError;
7425 if (x==enc_FAILED) { /* unencodable character */
7426 if (charmap_encoding_error(p, size, &inpos, mapping,
7427 &exc,
7428 &known_errorHandler, &errorHandler, errors,
7429 &res, &respos)) {
7430 goto onError;
7431 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007432 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007433 else
7434 /* done with this character => adjust input position */
7435 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007436 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007437
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007438 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007439 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007440 if (_PyBytes_Resize(&res, respos) < 0)
7441 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00007442
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007443 Py_XDECREF(exc);
7444 Py_XDECREF(errorHandler);
7445 return res;
7446
Benjamin Peterson29060642009-01-31 22:14:21 +00007447 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007448 Py_XDECREF(res);
7449 Py_XDECREF(exc);
7450 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007451 return NULL;
7452}
7453
Alexander Belopolsky40018472011-02-26 01:02:56 +00007454PyObject *
7455PyUnicode_AsCharmapString(PyObject *unicode,
7456 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007457{
7458 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007459 PyErr_BadArgument();
7460 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007461 }
7462 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007463 PyUnicode_GET_SIZE(unicode),
7464 mapping,
7465 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007466}
7467
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007468/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007469static void
7470make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007471 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007472 Py_ssize_t startpos, Py_ssize_t endpos,
7473 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007474{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007475 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007476 *exceptionObject = _PyUnicodeTranslateError_Create(
7477 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007478 }
7479 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007480 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7481 goto onError;
7482 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7483 goto onError;
7484 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7485 goto onError;
7486 return;
7487 onError:
7488 Py_DECREF(*exceptionObject);
7489 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007490 }
7491}
7492
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007493/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007494static void
7495raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007496 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007497 Py_ssize_t startpos, Py_ssize_t endpos,
7498 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007499{
7500 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007501 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007502 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007503 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007504}
7505
7506/* error handling callback helper:
7507 build arguments, call the callback and check the arguments,
7508 put the result into newpos and return the replacement string, which
7509 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007510static PyObject *
7511unicode_translate_call_errorhandler(const char *errors,
7512 PyObject **errorHandler,
7513 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007514 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007515 Py_ssize_t startpos, Py_ssize_t endpos,
7516 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007517{
Benjamin Peterson142957c2008-07-04 19:55:29 +00007518 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007519
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007520 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007521 PyObject *restuple;
7522 PyObject *resunicode;
7523
7524 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007525 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007526 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007527 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007528 }
7529
7530 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007531 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007532 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007533 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007534
7535 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00007536 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007537 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007538 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007539 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00007540 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007541 Py_DECREF(restuple);
7542 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007543 }
7544 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00007545 &resunicode, &i_newpos)) {
7546 Py_DECREF(restuple);
7547 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007548 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007549 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007550 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007551 else
7552 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007553 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007554 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7555 Py_DECREF(restuple);
7556 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007557 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007558 Py_INCREF(resunicode);
7559 Py_DECREF(restuple);
7560 return resunicode;
7561}
7562
7563/* Lookup the character ch in the mapping and put the result in result,
7564 which must be decrefed by the caller.
7565 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007566static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007567charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007568{
Christian Heimes217cfd12007-12-02 14:31:20 +00007569 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007570 PyObject *x;
7571
7572 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007573 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007574 x = PyObject_GetItem(mapping, w);
7575 Py_DECREF(w);
7576 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007577 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7578 /* No mapping found means: use 1:1 mapping. */
7579 PyErr_Clear();
7580 *result = NULL;
7581 return 0;
7582 } else
7583 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007584 }
7585 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007586 *result = x;
7587 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007588 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007589 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007590 long value = PyLong_AS_LONG(x);
7591 long max = PyUnicode_GetMax();
7592 if (value < 0 || value > max) {
7593 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00007594 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007595 Py_DECREF(x);
7596 return -1;
7597 }
7598 *result = x;
7599 return 0;
7600 }
7601 else if (PyUnicode_Check(x)) {
7602 *result = x;
7603 return 0;
7604 }
7605 else {
7606 /* wrong return value */
7607 PyErr_SetString(PyExc_TypeError,
7608 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007609 Py_DECREF(x);
7610 return -1;
7611 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007612}
7613/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00007614 if not reallocate and adjust various state variables.
7615 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007616static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007617charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00007618 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007619{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007620 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00007621 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007622 /* exponentially overallocate to minimize reallocations */
7623 if (requiredsize < 2 * oldsize)
7624 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007625 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
7626 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007627 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007628 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007629 }
7630 return 0;
7631}
7632/* lookup the character, put the result in the output string and adjust
7633 various state variables. Return a new reference to the object that
7634 was put in the output buffer in *result, or Py_None, if the mapping was
7635 undefined (in which case no character was written).
7636 The called must decref result.
7637 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007638static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007639charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
7640 PyObject *mapping, Py_UCS4 **output,
7641 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007642 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007643{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007644 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
7645 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00007646 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007647 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007648 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007649 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007650 }
7651 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007652 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00007653 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007654 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007655 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007656 }
7657 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007658 Py_ssize_t repsize;
7659 if (PyUnicode_READY(*res) == -1)
7660 return -1;
7661 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00007662 if (repsize==1) {
7663 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007664 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00007665 }
7666 else if (repsize!=0) {
7667 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007668 Py_ssize_t requiredsize = *opos +
7669 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00007670 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007671 Py_ssize_t i;
7672 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007673 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007674 for(i = 0; i < repsize; i++)
7675 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00007676 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007677 }
7678 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007679 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007680 return 0;
7681}
7682
Alexander Belopolsky40018472011-02-26 01:02:56 +00007683PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007684_PyUnicode_TranslateCharmap(PyObject *input,
7685 PyObject *mapping,
7686 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007687{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007688 /* input object */
7689 char *idata;
7690 Py_ssize_t size, i;
7691 int kind;
7692 /* output buffer */
7693 Py_UCS4 *output = NULL;
7694 Py_ssize_t osize;
7695 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007696 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007697 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007698 char *reason = "character maps to <undefined>";
7699 PyObject *errorHandler = NULL;
7700 PyObject *exc = NULL;
7701 /* the following variable is used for caching string comparisons
7702 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7703 * 3=ignore, 4=xmlcharrefreplace */
7704 int known_errorHandler = -1;
7705
Guido van Rossumd57fd912000-03-10 22:53:23 +00007706 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007707 PyErr_BadArgument();
7708 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007709 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007710
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007711 if (PyUnicode_READY(input) == -1)
7712 return NULL;
7713 idata = (char*)PyUnicode_DATA(input);
7714 kind = PyUnicode_KIND(input);
7715 size = PyUnicode_GET_LENGTH(input);
7716 i = 0;
7717
7718 if (size == 0) {
7719 Py_INCREF(input);
7720 return input;
7721 }
7722
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007723 /* allocate enough for a simple 1:1 translation without
7724 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007725 osize = size;
7726 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
7727 opos = 0;
7728 if (output == NULL) {
7729 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00007730 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007731 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007732
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007733 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007734 /* try to encode it */
7735 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007736 if (charmaptranslate_output(input, i, mapping,
7737 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007738 Py_XDECREF(x);
7739 goto onError;
7740 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007741 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00007742 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007743 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007744 else { /* untranslatable character */
7745 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7746 Py_ssize_t repsize;
7747 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007748 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00007749 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007750 Py_ssize_t collstart = i;
7751 Py_ssize_t collend = i+1;
7752 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007753
Benjamin Peterson29060642009-01-31 22:14:21 +00007754 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007755 while (collend < size) {
7756 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007757 goto onError;
7758 Py_XDECREF(x);
7759 if (x!=Py_None)
7760 break;
7761 ++collend;
7762 }
7763 /* cache callback name lookup
7764 * (if not done yet, i.e. it's the first error) */
7765 if (known_errorHandler==-1) {
7766 if ((errors==NULL) || (!strcmp(errors, "strict")))
7767 known_errorHandler = 1;
7768 else if (!strcmp(errors, "replace"))
7769 known_errorHandler = 2;
7770 else if (!strcmp(errors, "ignore"))
7771 known_errorHandler = 3;
7772 else if (!strcmp(errors, "xmlcharrefreplace"))
7773 known_errorHandler = 4;
7774 else
7775 known_errorHandler = 0;
7776 }
7777 switch (known_errorHandler) {
7778 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007779 raise_translate_exception(&exc, input, collstart,
7780 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007781 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007782 case 2: /* replace */
7783 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007784 for (coll = collstart; coll<collend; coll++)
7785 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00007786 /* fall through */
7787 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007788 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007789 break;
7790 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007791 /* generate replacement (temporarily (mis)uses i) */
7792 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007793 char buffer[2+29+1+1];
7794 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007795 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
7796 if (charmaptranslate_makespace(&output, &osize,
7797 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00007798 goto onError;
7799 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007800 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00007801 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007802 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007803 break;
7804 default:
7805 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007806 reason, input, &exc,
7807 collstart, collend, &newpos);
7808 if (repunicode == NULL || PyUnicode_READY(repunicode) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007809 goto onError;
7810 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007811 repsize = PyUnicode_GET_LENGTH(repunicode);
7812 if (charmaptranslate_makespace(&output, &osize,
7813 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007814 Py_DECREF(repunicode);
7815 goto onError;
7816 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007817 for (uni2 = 0; repsize-->0; ++uni2)
7818 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
7819 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00007820 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007821 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007822 }
7823 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007824 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
7825 if (!res)
7826 goto onError;
7827 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007828 Py_XDECREF(exc);
7829 Py_XDECREF(errorHandler);
7830 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007831
Benjamin Peterson29060642009-01-31 22:14:21 +00007832 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007833 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007834 Py_XDECREF(exc);
7835 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007836 return NULL;
7837}
7838
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007839/* Deprecated. Use PyUnicode_Translate instead. */
7840PyObject *
7841PyUnicode_TranslateCharmap(const Py_UNICODE *p,
7842 Py_ssize_t size,
7843 PyObject *mapping,
7844 const char *errors)
7845{
7846 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7847 if (!unicode)
7848 return NULL;
7849 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
7850}
7851
Alexander Belopolsky40018472011-02-26 01:02:56 +00007852PyObject *
7853PyUnicode_Translate(PyObject *str,
7854 PyObject *mapping,
7855 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007856{
7857 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007858
Guido van Rossumd57fd912000-03-10 22:53:23 +00007859 str = PyUnicode_FromObject(str);
7860 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007861 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007862 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007863 Py_DECREF(str);
7864 return result;
Tim Petersced69f82003-09-16 20:30:58 +00007865
Benjamin Peterson29060642009-01-31 22:14:21 +00007866 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007867 Py_XDECREF(str);
7868 return NULL;
7869}
Tim Petersced69f82003-09-16 20:30:58 +00007870
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007871static Py_UCS4
7872fix_decimal_and_space_to_ascii(PyUnicodeObject *self)
7873{
7874 /* No need to call PyUnicode_READY(self) because this function is only
7875 called as a callback from fixup() which does it already. */
7876 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
7877 const int kind = PyUnicode_KIND(self);
7878 void *data = PyUnicode_DATA(self);
7879 Py_UCS4 maxchar = 0, ch, fixed;
7880 Py_ssize_t i;
7881
7882 for (i = 0; i < len; ++i) {
7883 ch = PyUnicode_READ(kind, data, i);
7884 fixed = 0;
7885 if (ch > 127) {
7886 if (Py_UNICODE_ISSPACE(ch))
7887 fixed = ' ';
7888 else {
7889 const int decimal = Py_UNICODE_TODECIMAL(ch);
7890 if (decimal >= 0)
7891 fixed = '0' + decimal;
7892 }
7893 if (fixed != 0) {
7894 if (fixed > maxchar)
7895 maxchar = fixed;
7896 PyUnicode_WRITE(kind, data, i, fixed);
7897 }
7898 else if (ch > maxchar)
7899 maxchar = ch;
7900 }
7901 else if (ch > maxchar)
7902 maxchar = ch;
7903 }
7904
7905 return maxchar;
7906}
7907
7908PyObject *
7909_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
7910{
7911 if (!PyUnicode_Check(unicode)) {
7912 PyErr_BadInternalCall();
7913 return NULL;
7914 }
7915 if (PyUnicode_READY(unicode) == -1)
7916 return NULL;
7917 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
7918 /* If the string is already ASCII, just return the same string */
7919 Py_INCREF(unicode);
7920 return unicode;
7921 }
7922 return fixup((PyUnicodeObject *)unicode, fix_decimal_and_space_to_ascii);
7923}
7924
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007925PyObject *
7926PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
7927 Py_ssize_t length)
7928{
7929 PyObject *result;
7930 Py_UNICODE *p; /* write pointer into result */
7931 Py_ssize_t i;
7932 /* Copy to a new string */
7933 result = (PyObject *)_PyUnicode_New(length);
7934 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
7935 if (result == NULL)
7936 return result;
7937 p = PyUnicode_AS_UNICODE(result);
7938 /* Iterate over code points */
7939 for (i = 0; i < length; i++) {
7940 Py_UNICODE ch =s[i];
7941 if (ch > 127) {
7942 int decimal = Py_UNICODE_TODECIMAL(ch);
7943 if (decimal >= 0)
7944 p[i] = '0' + decimal;
7945 }
7946 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007947 if (PyUnicode_READY((PyUnicodeObject*)result) == -1) {
7948 Py_DECREF(result);
7949 return NULL;
7950 }
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007951 return result;
7952}
Guido van Rossum9e896b32000-04-05 20:11:21 +00007953/* --- Decimal Encoder ---------------------------------------------------- */
7954
Alexander Belopolsky40018472011-02-26 01:02:56 +00007955int
7956PyUnicode_EncodeDecimal(Py_UNICODE *s,
7957 Py_ssize_t length,
7958 char *output,
7959 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00007960{
7961 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007962 PyObject *errorHandler = NULL;
7963 PyObject *exc = NULL;
7964 const char *encoding = "decimal";
7965 const char *reason = "invalid decimal Unicode string";
7966 /* the following variable is used for caching string comparisons
7967 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
7968 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00007969
7970 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007971 PyErr_BadArgument();
7972 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00007973 }
7974
7975 p = s;
7976 end = s + length;
7977 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007978 register Py_UNICODE ch = *p;
7979 int decimal;
7980 PyObject *repunicode;
7981 Py_ssize_t repsize;
7982 Py_ssize_t newpos;
7983 Py_UNICODE *uni2;
7984 Py_UNICODE *collstart;
7985 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00007986
Benjamin Peterson29060642009-01-31 22:14:21 +00007987 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007988 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00007989 ++p;
7990 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007991 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007992 decimal = Py_UNICODE_TODECIMAL(ch);
7993 if (decimal >= 0) {
7994 *output++ = '0' + decimal;
7995 ++p;
7996 continue;
7997 }
7998 if (0 < ch && ch < 256) {
7999 *output++ = (char)ch;
8000 ++p;
8001 continue;
8002 }
8003 /* All other characters are considered unencodable */
8004 collstart = p;
8005 collend = p+1;
8006 while (collend < end) {
8007 if ((0 < *collend && *collend < 256) ||
8008 !Py_UNICODE_ISSPACE(*collend) ||
8009 Py_UNICODE_TODECIMAL(*collend))
8010 break;
8011 }
8012 /* cache callback name lookup
8013 * (if not done yet, i.e. it's the first error) */
8014 if (known_errorHandler==-1) {
8015 if ((errors==NULL) || (!strcmp(errors, "strict")))
8016 known_errorHandler = 1;
8017 else if (!strcmp(errors, "replace"))
8018 known_errorHandler = 2;
8019 else if (!strcmp(errors, "ignore"))
8020 known_errorHandler = 3;
8021 else if (!strcmp(errors, "xmlcharrefreplace"))
8022 known_errorHandler = 4;
8023 else
8024 known_errorHandler = 0;
8025 }
8026 switch (known_errorHandler) {
8027 case 1: /* strict */
8028 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
8029 goto onError;
8030 case 2: /* replace */
8031 for (p = collstart; p < collend; ++p)
8032 *output++ = '?';
8033 /* fall through */
8034 case 3: /* ignore */
8035 p = collend;
8036 break;
8037 case 4: /* xmlcharrefreplace */
8038 /* generate replacement (temporarily (mis)uses p) */
8039 for (p = collstart; p < collend; ++p)
8040 output += sprintf(output, "&#%d;", (int)*p);
8041 p = collend;
8042 break;
8043 default:
8044 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
8045 encoding, reason, s, length, &exc,
8046 collstart-s, collend-s, &newpos);
8047 if (repunicode == NULL)
8048 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008049 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00008050 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008051 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8052 Py_DECREF(repunicode);
8053 goto onError;
8054 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008055 /* generate replacement */
8056 repsize = PyUnicode_GET_SIZE(repunicode);
8057 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
8058 Py_UNICODE ch = *uni2;
8059 if (Py_UNICODE_ISSPACE(ch))
8060 *output++ = ' ';
8061 else {
8062 decimal = Py_UNICODE_TODECIMAL(ch);
8063 if (decimal >= 0)
8064 *output++ = '0' + decimal;
8065 else if (0 < ch && ch < 256)
8066 *output++ = (char)ch;
8067 else {
8068 Py_DECREF(repunicode);
8069 raise_encode_exception(&exc, encoding,
8070 s, length, collstart-s, collend-s, reason);
8071 goto onError;
8072 }
8073 }
8074 }
8075 p = s + newpos;
8076 Py_DECREF(repunicode);
8077 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00008078 }
8079 /* 0-terminate the output string */
8080 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008081 Py_XDECREF(exc);
8082 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008083 return 0;
8084
Benjamin Peterson29060642009-01-31 22:14:21 +00008085 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008086 Py_XDECREF(exc);
8087 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008088 return -1;
8089}
8090
Guido van Rossumd57fd912000-03-10 22:53:23 +00008091/* --- Helpers ------------------------------------------------------------ */
8092
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008093#include "stringlib/ucs1lib.h"
8094#include "stringlib/fastsearch.h"
8095#include "stringlib/partition.h"
8096#include "stringlib/split.h"
8097#include "stringlib/count.h"
8098#include "stringlib/find.h"
8099#include "stringlib/localeutil.h"
8100#include "stringlib/undef.h"
8101
8102#include "stringlib/ucs2lib.h"
8103#include "stringlib/fastsearch.h"
8104#include "stringlib/partition.h"
8105#include "stringlib/split.h"
8106#include "stringlib/count.h"
8107#include "stringlib/find.h"
8108#include "stringlib/localeutil.h"
8109#include "stringlib/undef.h"
8110
8111#include "stringlib/ucs4lib.h"
8112#include "stringlib/fastsearch.h"
8113#include "stringlib/partition.h"
8114#include "stringlib/split.h"
8115#include "stringlib/count.h"
8116#include "stringlib/find.h"
8117#include "stringlib/localeutil.h"
8118#include "stringlib/undef.h"
8119
8120static Py_ssize_t
8121any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
8122 const Py_UCS1*, Py_ssize_t,
8123 Py_ssize_t, Py_ssize_t),
8124 Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
8125 const Py_UCS2*, Py_ssize_t,
8126 Py_ssize_t, Py_ssize_t),
8127 Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t,
8128 const Py_UCS4*, Py_ssize_t,
8129 Py_ssize_t, Py_ssize_t),
8130 PyObject* s1, PyObject* s2,
8131 Py_ssize_t start,
8132 Py_ssize_t end)
8133{
8134 int kind1, kind2, kind;
8135 void *buf1, *buf2;
8136 Py_ssize_t len1, len2, result;
8137
8138 kind1 = PyUnicode_KIND(s1);
8139 kind2 = PyUnicode_KIND(s2);
8140 kind = kind1 > kind2 ? kind1 : kind2;
8141 buf1 = PyUnicode_DATA(s1);
8142 buf2 = PyUnicode_DATA(s2);
8143 if (kind1 != kind)
8144 buf1 = _PyUnicode_AsKind(s1, kind);
8145 if (!buf1)
8146 return -2;
8147 if (kind2 != kind)
8148 buf2 = _PyUnicode_AsKind(s2, kind);
8149 if (!buf2) {
8150 if (kind1 != kind) PyMem_Free(buf1);
8151 return -2;
8152 }
8153 len1 = PyUnicode_GET_LENGTH(s1);
8154 len2 = PyUnicode_GET_LENGTH(s2);
8155
8156 switch(kind) {
8157 case PyUnicode_1BYTE_KIND:
8158 result = ucs1(buf1, len1, buf2, len2, start, end);
8159 break;
8160 case PyUnicode_2BYTE_KIND:
8161 result = ucs2(buf1, len1, buf2, len2, start, end);
8162 break;
8163 case PyUnicode_4BYTE_KIND:
8164 result = ucs4(buf1, len1, buf2, len2, start, end);
8165 break;
8166 default:
8167 assert(0); result = -2;
8168 }
8169
8170 if (kind1 != kind)
8171 PyMem_Free(buf1);
8172 if (kind2 != kind)
8173 PyMem_Free(buf2);
8174
8175 return result;
8176}
8177
8178Py_ssize_t
8179_PyUnicode_InsertThousandsGrouping(int kind, void *data,
8180 Py_ssize_t n_buffer,
8181 void *digits, Py_ssize_t n_digits,
8182 Py_ssize_t min_width,
8183 const char *grouping,
8184 const char *thousands_sep)
8185{
8186 switch(kind) {
8187 case PyUnicode_1BYTE_KIND:
8188 return _PyUnicode_ucs1_InsertThousandsGrouping(
8189 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8190 min_width, grouping, thousands_sep);
8191 case PyUnicode_2BYTE_KIND:
8192 return _PyUnicode_ucs2_InsertThousandsGrouping(
8193 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
8194 min_width, grouping, thousands_sep);
8195 case PyUnicode_4BYTE_KIND:
8196 return _PyUnicode_ucs4_InsertThousandsGrouping(
8197 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
8198 min_width, grouping, thousands_sep);
8199 }
8200 assert(0);
8201 return -1;
8202}
8203
8204
Eric Smith8c663262007-08-25 02:26:07 +00008205#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00008206#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008207
Thomas Wouters477c8d52006-05-27 19:21:47 +00008208#include "stringlib/count.h"
8209#include "stringlib/find.h"
Eric Smith5807c412008-05-11 21:00:57 +00008210
/* Helper macro to fix up start/end slice values in place.

   `end` is clamped to `len`; a negative `end` or `start` is taken relative
   to `len` and then clamped to 0.  `start` is NOT clamped to `len`, so the
   caller must cope with start > end afterwards.

   `start` and `end` must be modifiable lvalues; `len` is evaluated more
   than once.  The body is wrapped in do { } while (0) so the macro behaves
   as a single statement (safe under an unbraced if/else), and every
   argument is parenthesized. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00008225
Alexander Belopolsky40018472011-02-26 01:02:56 +00008226Py_ssize_t
8227PyUnicode_Count(PyObject *str,
8228 PyObject *substr,
8229 Py_ssize_t start,
8230 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008231{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008232 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008233 PyUnicodeObject* str_obj;
8234 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008235 int kind1, kind2, kind;
8236 void *buf1 = NULL, *buf2 = NULL;
8237 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008238
Thomas Wouters477c8d52006-05-27 19:21:47 +00008239 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008240 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008241 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008242 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02008243 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008244 Py_DECREF(str_obj);
8245 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008246 }
Tim Petersced69f82003-09-16 20:30:58 +00008247
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008248 kind1 = PyUnicode_KIND(str_obj);
8249 kind2 = PyUnicode_KIND(sub_obj);
8250 kind = kind1 > kind2 ? kind1 : kind2;
8251 buf1 = PyUnicode_DATA(str_obj);
8252 if (kind1 != kind)
8253 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
8254 if (!buf1)
8255 goto onError;
8256 buf2 = PyUnicode_DATA(sub_obj);
8257 if (kind2 != kind)
8258 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
8259 if (!buf2)
8260 goto onError;
8261 len1 = PyUnicode_GET_LENGTH(str_obj);
8262 len2 = PyUnicode_GET_LENGTH(sub_obj);
8263
8264 ADJUST_INDICES(start, end, len1);
8265 switch(kind) {
8266 case PyUnicode_1BYTE_KIND:
8267 result = ucs1lib_count(
8268 ((Py_UCS1*)buf1) + start, end - start,
8269 buf2, len2, PY_SSIZE_T_MAX
8270 );
8271 break;
8272 case PyUnicode_2BYTE_KIND:
8273 result = ucs2lib_count(
8274 ((Py_UCS2*)buf1) + start, end - start,
8275 buf2, len2, PY_SSIZE_T_MAX
8276 );
8277 break;
8278 case PyUnicode_4BYTE_KIND:
8279 result = ucs4lib_count(
8280 ((Py_UCS4*)buf1) + start, end - start,
8281 buf2, len2, PY_SSIZE_T_MAX
8282 );
8283 break;
8284 default:
8285 assert(0); result = 0;
8286 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008287
8288 Py_DECREF(sub_obj);
8289 Py_DECREF(str_obj);
8290
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008291 if (kind1 != kind)
8292 PyMem_Free(buf1);
8293 if (kind2 != kind)
8294 PyMem_Free(buf2);
8295
Guido van Rossumd57fd912000-03-10 22:53:23 +00008296 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008297 onError:
8298 Py_DECREF(sub_obj);
8299 Py_DECREF(str_obj);
8300 if (kind1 != kind && buf1)
8301 PyMem_Free(buf1);
8302 if (kind2 != kind && buf2)
8303 PyMem_Free(buf2);
8304 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008305}
8306
Alexander Belopolsky40018472011-02-26 01:02:56 +00008307Py_ssize_t
8308PyUnicode_Find(PyObject *str,
8309 PyObject *sub,
8310 Py_ssize_t start,
8311 Py_ssize_t end,
8312 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008313{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008314 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008315
Guido van Rossumd57fd912000-03-10 22:53:23 +00008316 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008317 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008318 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008319 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008320 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008321 Py_DECREF(str);
8322 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008323 }
Tim Petersced69f82003-09-16 20:30:58 +00008324
Thomas Wouters477c8d52006-05-27 19:21:47 +00008325 if (direction > 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008326 result = any_find_slice(
8327 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
8328 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008329 );
8330 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008331 result = any_find_slice(
8332 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
8333 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008334 );
8335
Guido van Rossumd57fd912000-03-10 22:53:23 +00008336 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008337 Py_DECREF(sub);
8338
Guido van Rossumd57fd912000-03-10 22:53:23 +00008339 return result;
8340}
8341
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008342Py_ssize_t
8343PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8344 Py_ssize_t start, Py_ssize_t end,
8345 int direction)
8346{
8347 char *result;
8348 int kind;
8349 if (PyUnicode_READY(str) == -1)
8350 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02008351 if (start < 0 || end < 0) {
8352 PyErr_SetString(PyExc_IndexError, "string index out of range");
8353 return -2;
8354 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008355 if (end > PyUnicode_GET_LENGTH(str))
8356 end = PyUnicode_GET_LENGTH(str);
8357 kind = PyUnicode_KIND(str);
8358 result = findchar(PyUnicode_1BYTE_DATA(str)
8359 + PyUnicode_KIND_SIZE(kind, start),
8360 kind,
8361 end-start, ch, direction);
8362 if (!result)
8363 return -1;
8364 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8365}
8366
Alexander Belopolsky40018472011-02-26 01:02:56 +00008367static int
8368tailmatch(PyUnicodeObject *self,
8369 PyUnicodeObject *substring,
8370 Py_ssize_t start,
8371 Py_ssize_t end,
8372 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008373{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008374 int kind_self;
8375 int kind_sub;
8376 void *data_self;
8377 void *data_sub;
8378 Py_ssize_t offset;
8379 Py_ssize_t i;
8380 Py_ssize_t end_sub;
8381
8382 if (PyUnicode_READY(self) == -1 ||
8383 PyUnicode_READY(substring) == -1)
8384 return 0;
8385
8386 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008387 return 1;
8388
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008389 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8390 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008391 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008392 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008393
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008394 kind_self = PyUnicode_KIND(self);
8395 data_self = PyUnicode_DATA(self);
8396 kind_sub = PyUnicode_KIND(substring);
8397 data_sub = PyUnicode_DATA(substring);
8398 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8399
8400 if (direction > 0)
8401 offset = end;
8402 else
8403 offset = start;
8404
8405 if (PyUnicode_READ(kind_self, data_self, offset) ==
8406 PyUnicode_READ(kind_sub, data_sub, 0) &&
8407 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8408 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8409 /* If both are of the same kind, memcmp is sufficient */
8410 if (kind_self == kind_sub) {
8411 return ! memcmp((char *)data_self +
8412 (offset * PyUnicode_CHARACTER_SIZE(substring)),
8413 data_sub,
8414 PyUnicode_GET_LENGTH(substring) *
8415 PyUnicode_CHARACTER_SIZE(substring));
8416 }
8417 /* otherwise we have to compare each character by first accesing it */
8418 else {
8419 /* We do not need to compare 0 and len(substring)-1 because
8420 the if statement above ensured already that they are equal
8421 when we end up here. */
8422 // TODO: honor direction and do a forward or backwards search
8423 for (i = 1; i < end_sub; ++i) {
8424 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8425 PyUnicode_READ(kind_sub, data_sub, i))
8426 return 0;
8427 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008428 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008429 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008430 }
8431
8432 return 0;
8433}
8434
Alexander Belopolsky40018472011-02-26 01:02:56 +00008435Py_ssize_t
8436PyUnicode_Tailmatch(PyObject *str,
8437 PyObject *substr,
8438 Py_ssize_t start,
8439 Py_ssize_t end,
8440 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008441{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008442 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008443
Guido van Rossumd57fd912000-03-10 22:53:23 +00008444 str = PyUnicode_FromObject(str);
8445 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008446 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008447 substr = PyUnicode_FromObject(substr);
8448 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008449 Py_DECREF(str);
8450 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008451 }
Tim Petersced69f82003-09-16 20:30:58 +00008452
Guido van Rossumd57fd912000-03-10 22:53:23 +00008453 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00008454 (PyUnicodeObject *)substr,
8455 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008456 Py_DECREF(str);
8457 Py_DECREF(substr);
8458 return result;
8459}
8460
Guido van Rossumd57fd912000-03-10 22:53:23 +00008461/* Apply fixfct filter to the Unicode object self and return a
8462 reference to the modified object */
8463
Alexander Belopolsky40018472011-02-26 01:02:56 +00008464static PyObject *
8465fixup(PyUnicodeObject *self,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008466 Py_UCS4 (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008467{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008468 PyObject *u;
8469 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008470
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008471 if (PyUnicode_READY(self) == -1)
8472 return NULL;
8473 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8474 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8475 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008476 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008477 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008478
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008479 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
8480 PyUnicode_GET_LENGTH(u) * PyUnicode_CHARACTER_SIZE(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008481
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008482 /* fix functions return the new maximum character in a string,
8483 if the kind of the resulting unicode object does not change,
8484 everything is fine. Otherwise we need to change the string kind
8485 and re-run the fix function. */
8486 maxchar_new = fixfct((PyUnicodeObject*)u);
8487 if (maxchar_new == 0)
8488 /* do nothing, keep maxchar_new at 0 which means no changes. */;
8489 else if (maxchar_new <= 127)
8490 maxchar_new = 127;
8491 else if (maxchar_new <= 255)
8492 maxchar_new = 255;
8493 else if (maxchar_new <= 65535)
8494 maxchar_new = 65535;
8495 else
8496 maxchar_new = 1114111; /* 0x10ffff */
8497
8498 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008499 /* fixfct should return TRUE if it modified the buffer. If
8500 FALSE, return a reference to the original buffer instead
8501 (to save space, not time) */
8502 Py_INCREF(self);
8503 Py_DECREF(u);
8504 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008505 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008506 else if (maxchar_new == maxchar_old) {
8507 return u;
8508 }
8509 else {
8510 /* In case the maximum character changed, we need to
8511 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008512 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008513 if (v == NULL) {
8514 Py_DECREF(u);
8515 return NULL;
8516 }
8517 if (maxchar_new > maxchar_old) {
8518 /* If the maxchar increased so that the kind changed, not all
8519 characters are representable anymore and we need to fix the
8520 string again. This only happens in very few cases. */
Victor Stinner157f83f2011-09-28 21:41:31 +02008521 if (PyUnicode_CopyCharacters(v, 0,
8522 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008523 PyUnicode_GET_LENGTH(self)) < 0)
8524 {
8525 Py_DECREF(u);
8526 return NULL;
8527 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008528 maxchar_old = fixfct((PyUnicodeObject*)v);
8529 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8530 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008531 else {
Victor Stinner157f83f2011-09-28 21:41:31 +02008532 if (PyUnicode_CopyCharacters(v, 0,
8533 u, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008534 PyUnicode_GET_LENGTH(self)) < 0)
8535 {
8536 Py_DECREF(u);
8537 return NULL;
8538 }
8539 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008540
8541 Py_DECREF(u);
8542 return v;
8543 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008544}
8545
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008546static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008547fixupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008548{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008549 /* No need to call PyUnicode_READY(self) because this function is only
8550 called as a callback from fixup() which does it already. */
8551 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8552 const int kind = PyUnicode_KIND(self);
8553 void *data = PyUnicode_DATA(self);
8554 int touched = 0;
8555 Py_UCS4 maxchar = 0;
8556 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008557
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008558 for (i = 0; i < len; ++i) {
8559 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8560 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8561 if (up != ch) {
8562 if (up > maxchar)
8563 maxchar = up;
8564 PyUnicode_WRITE(kind, data, i, up);
8565 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008566 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008567 else if (ch > maxchar)
8568 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008569 }
8570
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008571 if (touched)
8572 return maxchar;
8573 else
8574 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008575}
8576
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008577static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008578fixlower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008579{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008580 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8581 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8582 const int kind = PyUnicode_KIND(self);
8583 void *data = PyUnicode_DATA(self);
8584 int touched = 0;
8585 Py_UCS4 maxchar = 0;
8586 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008587
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008588 for(i = 0; i < len; ++i) {
8589 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8590 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8591 if (lo != ch) {
8592 if (lo > maxchar)
8593 maxchar = lo;
8594 PyUnicode_WRITE(kind, data, i, lo);
8595 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008596 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008597 else if (ch > maxchar)
8598 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008599 }
8600
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008601 if (touched)
8602 return maxchar;
8603 else
8604 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008605}
8606
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008607static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008608fixswapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008609{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008610 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8611 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8612 const int kind = PyUnicode_KIND(self);
8613 void *data = PyUnicode_DATA(self);
8614 int touched = 0;
8615 Py_UCS4 maxchar = 0;
8616 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008617
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008618 for(i = 0; i < len; ++i) {
8619 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8620 Py_UCS4 nu = 0;
8621
8622 if (Py_UNICODE_ISUPPER(ch))
8623 nu = Py_UNICODE_TOLOWER(ch);
8624 else if (Py_UNICODE_ISLOWER(ch))
8625 nu = Py_UNICODE_TOUPPER(ch);
8626
8627 if (nu != 0) {
8628 if (nu > maxchar)
8629 maxchar = nu;
8630 PyUnicode_WRITE(kind, data, i, nu);
8631 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008632 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008633 else if (ch > maxchar)
8634 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008635 }
8636
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008637 if (touched)
8638 return maxchar;
8639 else
8640 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008641}
8642
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008643static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008644fixcapitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008645{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008646 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8647 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8648 const int kind = PyUnicode_KIND(self);
8649 void *data = PyUnicode_DATA(self);
8650 int touched = 0;
8651 Py_UCS4 maxchar = 0;
8652 Py_ssize_t i = 0;
8653 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00008654
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008655 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008656 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008657
8658 ch = PyUnicode_READ(kind, data, i);
8659 if (!Py_UNICODE_ISUPPER(ch)) {
8660 maxchar = Py_UNICODE_TOUPPER(ch);
8661 PyUnicode_WRITE(kind, data, i, maxchar);
8662 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008663 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008664 ++i;
8665 for(; i < len; ++i) {
8666 ch = PyUnicode_READ(kind, data, i);
8667 if (!Py_UNICODE_ISLOWER(ch)) {
8668 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8669 if (lo > maxchar)
8670 maxchar = lo;
8671 PyUnicode_WRITE(kind, data, i, lo);
8672 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008673 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008674 else if (ch > maxchar)
8675 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008676 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008677
8678 if (touched)
8679 return maxchar;
8680 else
8681 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008682}
8683
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008684static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008685fixtitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008686{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008687 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8688 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8689 const int kind = PyUnicode_KIND(self);
8690 void *data = PyUnicode_DATA(self);
8691 Py_UCS4 maxchar = 0;
8692 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008693 int previous_is_cased;
8694
8695 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008696 if (len == 1) {
8697 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8698 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
8699 if (ti != ch) {
8700 PyUnicode_WRITE(kind, data, i, ti);
8701 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00008702 }
8703 else
8704 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008705 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008706 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008707 for(; i < len; ++i) {
8708 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8709 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00008710
Benjamin Peterson29060642009-01-31 22:14:21 +00008711 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008712 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00008713 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008714 nu = Py_UNICODE_TOTITLE(ch);
8715
8716 if (nu > maxchar)
8717 maxchar = nu;
8718 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00008719
Benjamin Peterson29060642009-01-31 22:14:21 +00008720 if (Py_UNICODE_ISLOWER(ch) ||
8721 Py_UNICODE_ISUPPER(ch) ||
8722 Py_UNICODE_ISTITLE(ch))
8723 previous_is_cased = 1;
8724 else
8725 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008726 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008727 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008728}
8729
Tim Peters8ce9f162004-08-27 01:49:32 +00008730PyObject *
8731PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008732{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008733 PyObject *sep = NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008734 Py_ssize_t seplen = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008735 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00008736 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008737 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
8738 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00008739 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008740 Py_ssize_t sz, i, res_offset;
8741 Py_UCS4 maxchar = 0;
8742 Py_UCS4 item_maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008743
Tim Peters05eba1f2004-08-27 21:32:02 +00008744 fseq = PySequence_Fast(seq, "");
8745 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008746 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00008747 }
8748
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008749 /* NOTE: the following code can't call back into Python code,
8750 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00008751 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008752
Tim Peters05eba1f2004-08-27 21:32:02 +00008753 seqlen = PySequence_Fast_GET_SIZE(fseq);
8754 /* If empty sequence, return u"". */
8755 if (seqlen == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008756 res = PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008757 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00008758 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008759 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00008760 /* If singleton sequence with an exact Unicode, return that. */
8761 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008762 item = items[0];
8763 if (PyUnicode_CheckExact(item)) {
8764 Py_INCREF(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008765 res = item;
Benjamin Peterson29060642009-01-31 22:14:21 +00008766 goto Done;
8767 }
Tim Peters8ce9f162004-08-27 01:49:32 +00008768 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008769 else {
8770 /* Set up sep and seplen */
8771 if (separator == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008772 /* fall back to a blank space separator */
8773 sep = PyUnicode_FromOrdinal(' ');
Victor Stinnere9a29352011-10-01 02:14:59 +02008774 if (!sep)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008775 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00008776 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008777 else {
8778 if (!PyUnicode_Check(separator)) {
8779 PyErr_Format(PyExc_TypeError,
8780 "separator: expected str instance,"
8781 " %.80s found",
8782 Py_TYPE(separator)->tp_name);
8783 goto onError;
8784 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008785 if (PyUnicode_READY(separator) == -1)
8786 goto onError;
8787 sep = separator;
8788 seplen = PyUnicode_GET_LENGTH(separator);
8789 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
8790 /* inc refcount to keep this code path symetric with the
8791 above case of a blank separator */
8792 Py_INCREF(sep);
Tim Peters05eba1f2004-08-27 21:32:02 +00008793 }
8794 }
8795
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008796 /* There are at least two things to join, or else we have a subclass
8797 * of str in the sequence.
8798 * Do a pre-pass to figure out the total amount of space we'll
8799 * need (sz), and see whether all argument are strings.
8800 */
8801 sz = 0;
8802 for (i = 0; i < seqlen; i++) {
8803 const Py_ssize_t old_sz = sz;
8804 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00008805 if (!PyUnicode_Check(item)) {
8806 PyErr_Format(PyExc_TypeError,
8807 "sequence item %zd: expected str instance,"
8808 " %.80s found",
8809 i, Py_TYPE(item)->tp_name);
8810 goto onError;
8811 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008812 if (PyUnicode_READY(item) == -1)
8813 goto onError;
8814 sz += PyUnicode_GET_LENGTH(item);
8815 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
8816 if (item_maxchar > maxchar)
8817 maxchar = item_maxchar;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008818 if (i != 0)
8819 sz += seplen;
8820 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
8821 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008822 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008823 goto onError;
8824 }
8825 }
Tim Petersced69f82003-09-16 20:30:58 +00008826
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008827 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008828 if (res == NULL)
8829 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00008830
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008831 /* Catenate everything. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008832 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008833 Py_ssize_t itemlen;
8834 item = items[i];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008835 itemlen = PyUnicode_GET_LENGTH(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008836 /* Copy item, and maybe the separator. */
8837 if (i) {
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008838 if (PyUnicode_CopyCharacters(res, res_offset,
8839 sep, 0, seplen) < 0)
8840 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008841 res_offset += seplen;
Benjamin Peterson29060642009-01-31 22:14:21 +00008842 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008843 if (PyUnicode_CopyCharacters(res, res_offset,
8844 item, 0, itemlen) < 0)
8845 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008846 res_offset += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00008847 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008848 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00008849
Benjamin Peterson29060642009-01-31 22:14:21 +00008850 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00008851 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008852 Py_XDECREF(sep);
8853 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008854
Benjamin Peterson29060642009-01-31 22:14:21 +00008855 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00008856 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008857 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00008858 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008859 return NULL;
8860}
8861
/* Store `value` into `length` consecutive characters of the buffer
   `data`, beginning at character index `start`.  `kind` selects the
   character width: the 1-byte case uses memset(), the 2-byte case writes
   Py_UCS2 units, and every other kind is written as Py_UCS4.  `kind`
   must not be PyUnicode_WCHAR_KIND.

   Every argument is parenthesized in the expansion so that compound
   expressions can be passed safely.  Arguments may be evaluated more
   than once. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        } \
    } while (0)
8884
Alexander Belopolsky40018472011-02-26 01:02:56 +00008885static PyUnicodeObject *
8886pad(PyUnicodeObject *self,
8887 Py_ssize_t left,
8888 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008889 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008890{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008891 PyObject *u;
8892 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008893 int kind;
8894 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008895
8896 if (left < 0)
8897 left = 0;
8898 if (right < 0)
8899 right = 0;
8900
Tim Peters7a29bd52001-09-12 03:03:31 +00008901 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008902 Py_INCREF(self);
8903 return self;
8904 }
8905
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008906 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
8907 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00008908 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
8909 return NULL;
8910 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008911 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
8912 if (fill > maxchar)
8913 maxchar = fill;
8914 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008915 if (!u)
8916 return NULL;
8917
8918 kind = PyUnicode_KIND(u);
8919 data = PyUnicode_DATA(u);
8920 if (left)
8921 FILL(kind, data, fill, 0, left);
8922 if (right)
8923 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinner157f83f2011-09-28 21:41:31 +02008924 if (PyUnicode_CopyCharacters(u, left,
8925 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008926 _PyUnicode_LENGTH(self)) < 0)
8927 {
8928 Py_DECREF(u);
8929 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008930 }
8931
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008932 return (PyUnicodeObject*)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008933}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008934#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00008935
Alexander Belopolsky40018472011-02-26 01:02:56 +00008936PyObject *
8937PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008938{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008939 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008940
8941 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008942 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008943 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008944
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008945 switch(PyUnicode_KIND(string)) {
8946 case PyUnicode_1BYTE_KIND:
8947 list = ucs1lib_splitlines(
8948 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
8949 PyUnicode_GET_LENGTH(string), keepends);
8950 break;
8951 case PyUnicode_2BYTE_KIND:
8952 list = ucs2lib_splitlines(
8953 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
8954 PyUnicode_GET_LENGTH(string), keepends);
8955 break;
8956 case PyUnicode_4BYTE_KIND:
8957 list = ucs4lib_splitlines(
8958 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
8959 PyUnicode_GET_LENGTH(string), keepends);
8960 break;
8961 default:
8962 assert(0);
8963 list = 0;
8964 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008965 Py_DECREF(string);
8966 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008967}
8968
Alexander Belopolsky40018472011-02-26 01:02:56 +00008969static PyObject *
8970split(PyUnicodeObject *self,
8971 PyUnicodeObject *substring,
8972 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008973{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008974 int kind1, kind2, kind;
8975 void *buf1, *buf2;
8976 Py_ssize_t len1, len2;
8977 PyObject* out;
8978
Guido van Rossumd57fd912000-03-10 22:53:23 +00008979 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008980 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008981
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008982 if (PyUnicode_READY(self) == -1)
8983 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008984
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008985 if (substring == NULL)
8986 switch(PyUnicode_KIND(self)) {
8987 case PyUnicode_1BYTE_KIND:
8988 return ucs1lib_split_whitespace(
8989 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
8990 PyUnicode_GET_LENGTH(self), maxcount
8991 );
8992 case PyUnicode_2BYTE_KIND:
8993 return ucs2lib_split_whitespace(
8994 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
8995 PyUnicode_GET_LENGTH(self), maxcount
8996 );
8997 case PyUnicode_4BYTE_KIND:
8998 return ucs4lib_split_whitespace(
8999 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9000 PyUnicode_GET_LENGTH(self), maxcount
9001 );
9002 default:
9003 assert(0);
9004 return NULL;
9005 }
9006
9007 if (PyUnicode_READY(substring) == -1)
9008 return NULL;
9009
9010 kind1 = PyUnicode_KIND(self);
9011 kind2 = PyUnicode_KIND(substring);
9012 kind = kind1 > kind2 ? kind1 : kind2;
9013 buf1 = PyUnicode_DATA(self);
9014 buf2 = PyUnicode_DATA(substring);
9015 if (kind1 != kind)
9016 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9017 if (!buf1)
9018 return NULL;
9019 if (kind2 != kind)
9020 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9021 if (!buf2) {
9022 if (kind1 != kind) PyMem_Free(buf1);
9023 return NULL;
9024 }
9025 len1 = PyUnicode_GET_LENGTH(self);
9026 len2 = PyUnicode_GET_LENGTH(substring);
9027
9028 switch(kind) {
9029 case PyUnicode_1BYTE_KIND:
9030 out = ucs1lib_split(
9031 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9032 break;
9033 case PyUnicode_2BYTE_KIND:
9034 out = ucs2lib_split(
9035 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9036 break;
9037 case PyUnicode_4BYTE_KIND:
9038 out = ucs4lib_split(
9039 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9040 break;
9041 default:
9042 out = NULL;
9043 }
9044 if (kind1 != kind)
9045 PyMem_Free(buf1);
9046 if (kind2 != kind)
9047 PyMem_Free(buf2);
9048 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009049}
9050
Alexander Belopolsky40018472011-02-26 01:02:56 +00009051static PyObject *
9052rsplit(PyUnicodeObject *self,
9053 PyUnicodeObject *substring,
9054 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009055{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009056 int kind1, kind2, kind;
9057 void *buf1, *buf2;
9058 Py_ssize_t len1, len2;
9059 PyObject* out;
9060
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009061 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009062 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009063
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009064 if (PyUnicode_READY(self) == -1)
9065 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009066
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009067 if (substring == NULL)
9068 switch(PyUnicode_KIND(self)) {
9069 case PyUnicode_1BYTE_KIND:
9070 return ucs1lib_rsplit_whitespace(
9071 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9072 PyUnicode_GET_LENGTH(self), maxcount
9073 );
9074 case PyUnicode_2BYTE_KIND:
9075 return ucs2lib_rsplit_whitespace(
9076 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9077 PyUnicode_GET_LENGTH(self), maxcount
9078 );
9079 case PyUnicode_4BYTE_KIND:
9080 return ucs4lib_rsplit_whitespace(
9081 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9082 PyUnicode_GET_LENGTH(self), maxcount
9083 );
9084 default:
9085 assert(0);
9086 return NULL;
9087 }
9088
9089 if (PyUnicode_READY(substring) == -1)
9090 return NULL;
9091
9092 kind1 = PyUnicode_KIND(self);
9093 kind2 = PyUnicode_KIND(substring);
9094 kind = kind1 > kind2 ? kind1 : kind2;
9095 buf1 = PyUnicode_DATA(self);
9096 buf2 = PyUnicode_DATA(substring);
9097 if (kind1 != kind)
9098 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9099 if (!buf1)
9100 return NULL;
9101 if (kind2 != kind)
9102 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9103 if (!buf2) {
9104 if (kind1 != kind) PyMem_Free(buf1);
9105 return NULL;
9106 }
9107 len1 = PyUnicode_GET_LENGTH(self);
9108 len2 = PyUnicode_GET_LENGTH(substring);
9109
9110 switch(kind) {
9111 case PyUnicode_1BYTE_KIND:
9112 out = ucs1lib_rsplit(
9113 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9114 break;
9115 case PyUnicode_2BYTE_KIND:
9116 out = ucs2lib_rsplit(
9117 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9118 break;
9119 case PyUnicode_4BYTE_KIND:
9120 out = ucs4lib_rsplit(
9121 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9122 break;
9123 default:
9124 out = NULL;
9125 }
9126 if (kind1 != kind)
9127 PyMem_Free(buf1);
9128 if (kind2 != kind)
9129 PyMem_Free(buf2);
9130 return out;
9131}
9132
9133static Py_ssize_t
9134anylib_find(int kind, void *buf1, Py_ssize_t len1,
9135 void *buf2, Py_ssize_t len2, Py_ssize_t offset)
9136{
9137 switch(kind) {
9138 case PyUnicode_1BYTE_KIND:
9139 return ucs1lib_find(buf1, len1, buf2, len2, offset);
9140 case PyUnicode_2BYTE_KIND:
9141 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9142 case PyUnicode_4BYTE_KIND:
9143 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9144 }
9145 assert(0);
9146 return -1;
9147}
9148
9149static Py_ssize_t
9150anylib_count(int kind, void* sbuf, Py_ssize_t slen,
9151 void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
9152{
9153 switch(kind) {
9154 case PyUnicode_1BYTE_KIND:
9155 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
9156 case PyUnicode_2BYTE_KIND:
9157 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9158 case PyUnicode_4BYTE_KIND:
9159 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9160 }
9161 assert(0);
9162 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009163}
9164
Alexander Belopolsky40018472011-02-26 01:02:56 +00009165static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009166replace(PyObject *self, PyObject *str1,
9167 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009168{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009169 PyObject *u;
9170 char *sbuf = PyUnicode_DATA(self);
9171 char *buf1 = PyUnicode_DATA(str1);
9172 char *buf2 = PyUnicode_DATA(str2);
9173 int srelease = 0, release1 = 0, release2 = 0;
9174 int skind = PyUnicode_KIND(self);
9175 int kind1 = PyUnicode_KIND(str1);
9176 int kind2 = PyUnicode_KIND(str2);
9177 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9178 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9179 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009180
9181 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009182 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009183 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009184 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009185
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009186 if (skind < kind1)
9187 /* substring too wide to be present */
9188 goto nothing;
9189
9190 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00009191 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009192 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009193 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009194 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009195 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009196 /* replace characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009197 Py_UCS4 u1, u2, maxchar;
9198 int mayshrink, rkind;
9199 u1 = PyUnicode_READ_CHAR(str1, 0);
9200 if (!findchar(sbuf, PyUnicode_KIND(self),
9201 slen, u1, 1))
Thomas Wouters477c8d52006-05-27 19:21:47 +00009202 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009203 u2 = PyUnicode_READ_CHAR(str2, 0);
9204 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9205 /* Replacing u1 with u2 may cause a maxchar reduction in the
9206 result string. */
9207 mayshrink = maxchar > 127;
9208 if (u2 > maxchar) {
9209 maxchar = u2;
9210 mayshrink = 0;
9211 }
9212 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009213 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009214 goto error;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009215 if (PyUnicode_CopyCharacters(u, 0,
9216 (PyObject*)self, 0, slen) < 0)
9217 {
9218 Py_DECREF(u);
9219 return NULL;
9220 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009221 rkind = PyUnicode_KIND(u);
9222 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
9223 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009224 if (--maxcount < 0)
9225 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009226 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009227 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009228 if (mayshrink) {
9229 PyObject *tmp = u;
9230 u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp),
9231 PyUnicode_GET_LENGTH(tmp));
9232 Py_DECREF(tmp);
9233 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009234 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009235 int rkind = skind;
9236 char *res;
9237 if (kind1 < rkind) {
9238 /* widen substring */
9239 buf1 = _PyUnicode_AsKind(str1, rkind);
9240 if (!buf1) goto error;
9241 release1 = 1;
9242 }
9243 i = anylib_find(rkind, sbuf, slen, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009244 if (i < 0)
9245 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009246 if (rkind > kind2) {
9247 /* widen replacement */
9248 buf2 = _PyUnicode_AsKind(str2, rkind);
9249 if (!buf2) goto error;
9250 release2 = 1;
9251 }
9252 else if (rkind < kind2) {
9253 /* widen self and buf1 */
9254 rkind = kind2;
9255 if (release1) PyMem_Free(buf1);
9256 sbuf = _PyUnicode_AsKind(self, rkind);
9257 if (!sbuf) goto error;
9258 srelease = 1;
9259 buf1 = _PyUnicode_AsKind(str1, rkind);
9260 if (!buf1) goto error;
9261 release1 = 1;
9262 }
9263 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen));
9264 if (!res) {
9265 PyErr_NoMemory();
9266 goto error;
9267 }
9268 memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen));
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009269 /* change everything in-place, starting with this one */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009270 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9271 buf2,
9272 PyUnicode_KIND_SIZE(rkind, len2));
9273 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009274
9275 while ( --maxcount > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009276 i = anylib_find(rkind, sbuf+PyUnicode_KIND_SIZE(rkind, i),
9277 slen-i,
9278 buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009279 if (i == -1)
9280 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009281 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9282 buf2,
9283 PyUnicode_KIND_SIZE(rkind, len2));
9284 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009285 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009286
9287 u = PyUnicode_FromKindAndData(rkind, res, slen);
9288 PyMem_Free(res);
9289 if (!u) goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009290 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009291 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009292
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009293 Py_ssize_t n, i, j, ires;
9294 Py_ssize_t product, new_size;
9295 int rkind = skind;
9296 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009297
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009298 if (kind1 < rkind) {
9299 buf1 = _PyUnicode_AsKind(str1, rkind);
9300 if (!buf1) goto error;
9301 release1 = 1;
9302 }
9303 n = anylib_count(rkind, sbuf, slen, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009304 if (n == 0)
9305 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009306 if (kind2 < rkind) {
9307 buf2 = _PyUnicode_AsKind(str2, rkind);
9308 if (!buf2) goto error;
9309 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009310 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009311 else if (kind2 > rkind) {
9312 rkind = kind2;
9313 sbuf = _PyUnicode_AsKind(self, rkind);
9314 if (!sbuf) goto error;
9315 srelease = 1;
9316 if (release1) PyMem_Free(buf1);
9317 buf1 = _PyUnicode_AsKind(str1, rkind);
9318 if (!buf1) goto error;
9319 release1 = 1;
9320 }
9321 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
9322 PyUnicode_GET_LENGTH(str1))); */
9323 product = n * (len2-len1);
9324 if ((product / (len2-len1)) != n) {
9325 PyErr_SetString(PyExc_OverflowError,
9326 "replace string is too long");
9327 goto error;
9328 }
9329 new_size = slen + product;
9330 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
9331 PyErr_SetString(PyExc_OverflowError,
9332 "replace string is too long");
9333 goto error;
9334 }
9335 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size));
9336 if (!res)
9337 goto error;
9338 ires = i = 0;
9339 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009340 while (n-- > 0) {
9341 /* look for next match */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009342 j = anylib_find(rkind,
9343 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9344 slen-i, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009345 if (j == -1)
9346 break;
9347 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009348 /* copy unchanged part [i:j] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009349 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9350 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9351 PyUnicode_KIND_SIZE(rkind, j-i));
9352 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009353 }
9354 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009355 if (len2 > 0) {
9356 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9357 buf2,
9358 PyUnicode_KIND_SIZE(rkind, len2));
9359 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009360 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009361 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009362 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009363 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009364 /* copy tail [i:] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009365 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9366 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9367 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009368 } else {
9369 /* interleave */
9370 while (n > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009371 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9372 buf2,
9373 PyUnicode_KIND_SIZE(rkind, len2));
9374 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009375 if (--n <= 0)
9376 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009377 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9378 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9379 PyUnicode_KIND_SIZE(rkind, 1));
9380 ires++;
9381 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009382 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009383 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9384 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9385 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009386 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009387 u = PyUnicode_FromKindAndData(rkind, res, new_size);
Martin v. Löwis0b1d3482011-10-01 16:35:40 +02009388 PyMem_Free(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009389 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009390 if (srelease)
9391 PyMem_FREE(sbuf);
9392 if (release1)
9393 PyMem_FREE(buf1);
9394 if (release2)
9395 PyMem_FREE(buf2);
9396 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009397
Benjamin Peterson29060642009-01-31 22:14:21 +00009398 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00009399 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009400 if (srelease)
9401 PyMem_FREE(sbuf);
9402 if (release1)
9403 PyMem_FREE(buf1);
9404 if (release2)
9405 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009406 if (PyUnicode_CheckExact(self)) {
9407 Py_INCREF(self);
9408 return (PyObject *) self;
9409 }
Victor Stinner034f6cf2011-09-30 02:26:44 +02009410 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009411 error:
9412 if (srelease && sbuf)
9413 PyMem_FREE(sbuf);
9414 if (release1 && buf1)
9415 PyMem_FREE(buf1);
9416 if (release2 && buf2)
9417 PyMem_FREE(buf2);
9418 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009419}
9420
9421/* --- Unicode Object Methods --------------------------------------------- */
9422
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009423PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009424 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009425\n\
9426Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009427characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009428
9429static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009430unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009431{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009432 return fixup(self, fixtitle);
9433}
9434
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009435PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009436 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009437\n\
9438Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00009439have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009440
9441static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009442unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009443{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009444 return fixup(self, fixcapitalize);
9445}
9446
9447#if 0
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009448PyDoc_STRVAR(capwords__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009449 "S.capwords() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009450\n\
9451Apply .capitalize() to all words in S and return the result with\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009452normalized whitespace (all whitespace strings are replaced by ' ').");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009453
9454static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009455unicode_capwords(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009456{
9457 PyObject *list;
9458 PyObject *item;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009459 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009460
Guido van Rossumd57fd912000-03-10 22:53:23 +00009461 /* Split into words */
9462 list = split(self, NULL, -1);
9463 if (!list)
9464 return NULL;
9465
9466 /* Capitalize each word */
9467 for (i = 0; i < PyList_GET_SIZE(list); i++) {
9468 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
Benjamin Peterson29060642009-01-31 22:14:21 +00009469 fixcapitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009470 if (item == NULL)
9471 goto onError;
9472 Py_DECREF(PyList_GET_ITEM(list, i));
9473 PyList_SET_ITEM(list, i, item);
9474 }
9475
9476 /* Join the words to form a new string */
9477 item = PyUnicode_Join(NULL, list);
9478
Benjamin Peterson29060642009-01-31 22:14:21 +00009479 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009480 Py_DECREF(list);
9481 return (PyObject *)item;
9482}
9483#endif
9484
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009485/* Argument converter. Coerces to a single unicode character */
9486
9487static int
9488convert_uc(PyObject *obj, void *addr)
9489{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009490 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009491 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009492
Benjamin Peterson14339b62009-01-31 16:36:08 +00009493 uniobj = PyUnicode_FromObject(obj);
9494 if (uniobj == NULL) {
9495 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009496 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009497 return 0;
9498 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009499 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009500 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009501 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009502 Py_DECREF(uniobj);
9503 return 0;
9504 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009505 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009506 Py_DECREF(uniobj);
9507 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009508}
9509
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009510PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009511 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009512\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009513Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009514done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009515
9516static PyObject *
9517unicode_center(PyUnicodeObject *self, PyObject *args)
9518{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009519 Py_ssize_t marg, left;
9520 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009521 Py_UCS4 fillchar = ' ';
9522
Victor Stinnere9a29352011-10-01 02:14:59 +02009523 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009524 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009525
Victor Stinnere9a29352011-10-01 02:14:59 +02009526 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009527 return NULL;
9528
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009529 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009530 Py_INCREF(self);
9531 return (PyObject*) self;
9532 }
9533
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009534 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009535 left = marg / 2 + (marg & width & 1);
9536
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009537 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009538}
9539
Marc-André Lemburge5034372000-08-08 08:04:29 +00009540#if 0
9541
9542/* This code should go into some future Unicode collation support
9543 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00009544 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00009545
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009546/* speedy UTF-16 code point order comparison */
9547/* gleaned from: */
9548/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
9549
/* Adjustment table used by the disabled UTF-16 comparison in this
   #if 0 section.  It is indexed by the top bits of a code unit
   (value >> 11) and holds the amount added to that code unit before
   comparing. */
static short utf16Fixup[32] =
{
    /*  0 ..  7 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /*  8 .. 15 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 16 .. 23 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 24 .. 26 */ 0, 0, 0,
    /* 27       */ 0x2000,
    /* 28 .. 31 */ -0x800, -0x800, -0x800, -0x800
};
9557
Guido van Rossumd57fd912000-03-10 22:53:23 +00009558static int
9559unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9560{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009561 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009562
Guido van Rossumd57fd912000-03-10 22:53:23 +00009563 Py_UNICODE *s1 = str1->str;
9564 Py_UNICODE *s2 = str2->str;
9565
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009566 len1 = str1->_base._base.length;
9567 len2 = str2->_base._base.length;
Tim Petersced69f82003-09-16 20:30:58 +00009568
Guido van Rossumd57fd912000-03-10 22:53:23 +00009569 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00009570 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009571
9572 c1 = *s1++;
9573 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00009574
Benjamin Peterson29060642009-01-31 22:14:21 +00009575 if (c1 > (1<<11) * 26)
9576 c1 += utf16Fixup[c1>>11];
9577 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009578 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009579 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00009580
9581 if (c1 != c2)
9582 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00009583
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009584 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009585 }
9586
9587 return (len1 < len2) ? -1 : (len1 != len2);
9588}
9589
Marc-André Lemburge5034372000-08-08 08:04:29 +00009590#else
9591
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009592/* This function assumes that str1 and str2 are readied by the caller. */
9593
Marc-André Lemburge5034372000-08-08 08:04:29 +00009594static int
9595unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9596{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009597 int kind1, kind2;
9598 void *data1, *data2;
9599 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009600
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009601 kind1 = PyUnicode_KIND(str1);
9602 kind2 = PyUnicode_KIND(str2);
9603 data1 = PyUnicode_DATA(str1);
9604 data2 = PyUnicode_DATA(str2);
9605 len1 = PyUnicode_GET_LENGTH(str1);
9606 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +00009607
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009608 for (i = 0; i < len1 && i < len2; ++i) {
9609 Py_UCS4 c1, c2;
9610 c1 = PyUnicode_READ(kind1, data1, i);
9611 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +00009612
9613 if (c1 != c2)
9614 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009615 }
9616
9617 return (len1 < len2) ? -1 : (len1 != len2);
9618}
9619
9620#endif
9621
Alexander Belopolsky40018472011-02-26 01:02:56 +00009622int
9623PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009624{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009625 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9626 if (PyUnicode_READY(left) == -1 ||
9627 PyUnicode_READY(right) == -1)
9628 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009629 return unicode_compare((PyUnicodeObject *)left,
9630 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009631 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009632 PyErr_Format(PyExc_TypeError,
9633 "Can't compare %.100s and %.100s",
9634 left->ob_type->tp_name,
9635 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009636 return -1;
9637}
9638
Martin v. Löwis5b222132007-06-10 09:51:05 +00009639int
9640PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
9641{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009642 Py_ssize_t i;
9643 int kind;
9644 void *data;
9645 Py_UCS4 chr;
9646
Victor Stinner910337b2011-10-03 03:20:16 +02009647 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009648 if (PyUnicode_READY(uni) == -1)
9649 return -1;
9650 kind = PyUnicode_KIND(uni);
9651 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009652 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009653 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
9654 if (chr != str[i])
9655 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00009656 /* This check keeps Python strings that end in '\0' from comparing equal
9657 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009658 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +00009659 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009660 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00009661 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009662 return 0;
9663}
9664
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009665
Benjamin Peterson29060642009-01-31 22:14:21 +00009666#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00009667 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009668
Alexander Belopolsky40018472011-02-26 01:02:56 +00009669PyObject *
9670PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009671{
9672 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009673
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009674 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9675 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009676 if (PyUnicode_READY(left) == -1 ||
9677 PyUnicode_READY(right) == -1)
9678 return NULL;
9679 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
9680 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009681 if (op == Py_EQ) {
9682 Py_INCREF(Py_False);
9683 return Py_False;
9684 }
9685 if (op == Py_NE) {
9686 Py_INCREF(Py_True);
9687 return Py_True;
9688 }
9689 }
9690 if (left == right)
9691 result = 0;
9692 else
9693 result = unicode_compare((PyUnicodeObject *)left,
9694 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009695
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009696 /* Convert the return value to a Boolean */
9697 switch (op) {
9698 case Py_EQ:
9699 v = TEST_COND(result == 0);
9700 break;
9701 case Py_NE:
9702 v = TEST_COND(result != 0);
9703 break;
9704 case Py_LE:
9705 v = TEST_COND(result <= 0);
9706 break;
9707 case Py_GE:
9708 v = TEST_COND(result >= 0);
9709 break;
9710 case Py_LT:
9711 v = TEST_COND(result == -1);
9712 break;
9713 case Py_GT:
9714 v = TEST_COND(result == 1);
9715 break;
9716 default:
9717 PyErr_BadArgument();
9718 return NULL;
9719 }
9720 Py_INCREF(v);
9721 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009722 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009723
Brian Curtindfc80e32011-08-10 20:28:54 -05009724 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009725}
9726
Alexander Belopolsky40018472011-02-26 01:02:56 +00009727int
9728PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00009729{
Thomas Wouters477c8d52006-05-27 19:21:47 +00009730 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009731 int kind1, kind2, kind;
9732 void *buf1, *buf2;
9733 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009734 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009735
9736 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00009737 sub = PyUnicode_FromObject(element);
9738 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009739 PyErr_Format(PyExc_TypeError,
9740 "'in <string>' requires string as left operand, not %s",
9741 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009742 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009743 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009744 if (PyUnicode_READY(sub) == -1)
9745 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009746
Thomas Wouters477c8d52006-05-27 19:21:47 +00009747 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +02009748 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009749 Py_DECREF(sub);
9750 return -1;
9751 }
9752
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009753 kind1 = PyUnicode_KIND(str);
9754 kind2 = PyUnicode_KIND(sub);
9755 kind = kind1 > kind2 ? kind1 : kind2;
9756 buf1 = PyUnicode_DATA(str);
9757 buf2 = PyUnicode_DATA(sub);
9758 if (kind1 != kind)
9759 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
9760 if (!buf1) {
9761 Py_DECREF(sub);
9762 return -1;
9763 }
9764 if (kind2 != kind)
9765 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
9766 if (!buf2) {
9767 Py_DECREF(sub);
9768 if (kind1 != kind) PyMem_Free(buf1);
9769 return -1;
9770 }
9771 len1 = PyUnicode_GET_LENGTH(str);
9772 len2 = PyUnicode_GET_LENGTH(sub);
9773
9774 switch(kind) {
9775 case PyUnicode_1BYTE_KIND:
9776 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
9777 break;
9778 case PyUnicode_2BYTE_KIND:
9779 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
9780 break;
9781 case PyUnicode_4BYTE_KIND:
9782 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
9783 break;
9784 default:
9785 result = -1;
9786 assert(0);
9787 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009788
9789 Py_DECREF(str);
9790 Py_DECREF(sub);
9791
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009792 if (kind1 != kind)
9793 PyMem_Free(buf1);
9794 if (kind2 != kind)
9795 PyMem_Free(buf2);
9796
Guido van Rossum403d68b2000-03-13 15:55:09 +00009797 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009798}
9799
Guido van Rossumd57fd912000-03-10 22:53:23 +00009800/* Concat to string or Unicode object giving a new Unicode object. */
9801
Alexander Belopolsky40018472011-02-26 01:02:56 +00009802PyObject *
9803PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009804{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009805 PyObject *u = NULL, *v = NULL, *w;
9806 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009807
9808 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009809 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009810 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009811 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009812 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009813 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009814 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009815
9816 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +02009817 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009818 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009819 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009820 }
Victor Stinnera464fc12011-10-02 20:39:30 +02009821 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009822 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009823 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009824 }
9825
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009826 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinnerff9e50f2011-09-28 22:17:19 +02009827 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009828
Guido van Rossumd57fd912000-03-10 22:53:23 +00009829 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009830 w = PyUnicode_New(
9831 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
9832 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009833 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009834 goto onError;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009835 if (PyUnicode_CopyCharacters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)) < 0)
9836 goto onError;
Victor Stinner157f83f2011-09-28 21:41:31 +02009837 if (PyUnicode_CopyCharacters(w, PyUnicode_GET_LENGTH(u),
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009838 v, 0,
9839 PyUnicode_GET_LENGTH(v)) < 0)
9840 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009841 Py_DECREF(u);
9842 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009843 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009844
Benjamin Peterson29060642009-01-31 22:14:21 +00009845 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009846 Py_XDECREF(u);
9847 Py_XDECREF(v);
9848 return NULL;
9849}
9850
Walter Dörwald1ab83302007-05-18 17:15:44 +00009851void
Victor Stinner23e56682011-10-03 03:54:37 +02009852PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +00009853{
Victor Stinner23e56682011-10-03 03:54:37 +02009854 PyObject *left, *res;
9855
9856 if (p_left == NULL) {
9857 if (!PyErr_Occurred())
9858 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +00009859 return;
9860 }
Victor Stinner23e56682011-10-03 03:54:37 +02009861 left = *p_left;
9862 if (right == NULL || !PyUnicode_Check(left)) {
9863 if (!PyErr_Occurred())
9864 PyErr_BadInternalCall();
9865 goto error;
9866 }
9867
9868 if (PyUnicode_CheckExact(left) && left != unicode_empty
9869 && PyUnicode_CheckExact(right) && right != unicode_empty
9870 && unicode_resizable(left)
9871 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
9872 || _PyUnicode_WSTR(left) != NULL))
9873 {
9874 Py_ssize_t u_len, v_len, new_len, copied;
9875
9876 /* FIXME: don't make wstr string ready */
9877 if (PyUnicode_READY(left))
9878 goto error;
9879 if (PyUnicode_READY(right))
9880 goto error;
9881
9882 /* FIXME: support ascii+latin1, PyASCIIObject => PyCompactUnicodeObject */
9883 if (PyUnicode_MAX_CHAR_VALUE(right) <= PyUnicode_MAX_CHAR_VALUE(left))
9884 {
9885 u_len = PyUnicode_GET_LENGTH(left);
9886 v_len = PyUnicode_GET_LENGTH(right);
9887 if (u_len > PY_SSIZE_T_MAX - v_len) {
9888 PyErr_SetString(PyExc_OverflowError,
9889 "strings are too large to concat");
9890 goto error;
9891 }
9892 new_len = u_len + v_len;
9893
9894 /* Now we own the last reference to 'left', so we can resize it
9895 * in-place.
9896 */
9897 if (unicode_resize(&left, new_len) != 0) {
9898 /* XXX if _PyUnicode_Resize() fails, 'left' has been
9899 * deallocated so it cannot be put back into
9900 * 'variable'. The MemoryError is raised when there
9901 * is no value in 'variable', which might (very
9902 * remotely) be a cause of incompatibilities.
9903 */
9904 goto error;
9905 }
9906 /* copy 'right' into the newly allocated area of 'left' */
9907 copied = PyUnicode_CopyCharacters(left, u_len,
9908 right, 0,
9909 v_len);
9910 assert(0 <= copied);
9911 *p_left = left;
9912 return;
9913 }
9914 }
9915
9916 res = PyUnicode_Concat(left, right);
9917 if (res == NULL)
9918 goto error;
9919 Py_DECREF(left);
9920 *p_left = res;
9921 return;
9922
9923error:
9924 Py_DECREF(*p_left);
9925 *p_left = NULL;
Walter Dörwald1ab83302007-05-18 17:15:44 +00009926}
9927
9928void
9929PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
9930{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009931 PyUnicode_Append(pleft, right);
9932 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00009933}
9934
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009935PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009936 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009937\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00009938Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009939string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009940interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009941
9942static PyObject *
9943unicode_count(PyUnicodeObject *self, PyObject *args)
9944{
9945 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009946 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009947 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009948 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009949 int kind1, kind2, kind;
9950 void *buf1, *buf2;
9951 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009952
Jesus Ceaac451502011-04-20 17:09:23 +02009953 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
9954 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +00009955 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00009956
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009957 kind1 = PyUnicode_KIND(self);
9958 kind2 = PyUnicode_KIND(substring);
9959 kind = kind1 > kind2 ? kind1 : kind2;
9960 buf1 = PyUnicode_DATA(self);
9961 buf2 = PyUnicode_DATA(substring);
9962 if (kind1 != kind)
9963 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9964 if (!buf1) {
9965 Py_DECREF(substring);
9966 return NULL;
9967 }
9968 if (kind2 != kind)
9969 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9970 if (!buf2) {
9971 Py_DECREF(substring);
9972 if (kind1 != kind) PyMem_Free(buf1);
9973 return NULL;
9974 }
9975 len1 = PyUnicode_GET_LENGTH(self);
9976 len2 = PyUnicode_GET_LENGTH(substring);
9977
9978 ADJUST_INDICES(start, end, len1);
9979 switch(kind) {
9980 case PyUnicode_1BYTE_KIND:
9981 iresult = ucs1lib_count(
9982 ((Py_UCS1*)buf1) + start, end - start,
9983 buf2, len2, PY_SSIZE_T_MAX
9984 );
9985 break;
9986 case PyUnicode_2BYTE_KIND:
9987 iresult = ucs2lib_count(
9988 ((Py_UCS2*)buf1) + start, end - start,
9989 buf2, len2, PY_SSIZE_T_MAX
9990 );
9991 break;
9992 case PyUnicode_4BYTE_KIND:
9993 iresult = ucs4lib_count(
9994 ((Py_UCS4*)buf1) + start, end - start,
9995 buf2, len2, PY_SSIZE_T_MAX
9996 );
9997 break;
9998 default:
9999 assert(0); iresult = 0;
10000 }
10001
10002 result = PyLong_FromSsize_t(iresult);
10003
10004 if (kind1 != kind)
10005 PyMem_Free(buf1);
10006 if (kind2 != kind)
10007 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010008
10009 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010010
Guido van Rossumd57fd912000-03-10 22:53:23 +000010011 return result;
10012}
10013
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010014PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010015 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010016\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010017Encode S using the codec registered for encoding. Default encoding\n\
10018is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010019handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010020a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10021'xmlcharrefreplace' as well as any other name registered with\n\
10022codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010023
10024static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +000010025unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010026{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010027 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010028 char *encoding = NULL;
10029 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010030
Benjamin Peterson308d6372009-09-18 21:42:35 +000010031 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10032 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010033 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +000010034 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010035}
10036
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010037PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010038 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010039\n\
10040Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010041If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010042
10043static PyObject*
10044unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
10045{
10046 Py_UNICODE *e;
10047 Py_UNICODE *p;
10048 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010049 Py_UNICODE *qe;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010050 Py_ssize_t i, j, incr, wstr_length;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010051 PyUnicodeObject *u;
10052 int tabsize = 8;
10053
10054 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010055 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010056
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010057 if (PyUnicode_AsUnicodeAndSize((PyObject *)self, &wstr_length) == NULL)
10058 return NULL;
10059
Thomas Wouters7e474022000-07-16 12:04:32 +000010060 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010061 i = 0; /* chars up to and including most recent \n or \r */
10062 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010063 e = _PyUnicode_WSTR(self) + wstr_length; /* end of input */
10064 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010065 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010066 if (tabsize > 0) {
10067 incr = tabsize - (j % tabsize); /* cannot overflow */
10068 if (j > PY_SSIZE_T_MAX - incr)
10069 goto overflow1;
10070 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010071 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010072 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010073 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010074 if (j > PY_SSIZE_T_MAX - 1)
10075 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010076 j++;
10077 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010078 if (i > PY_SSIZE_T_MAX - j)
10079 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010080 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010081 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010082 }
10083 }
10084
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010085 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +000010086 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010087
Guido van Rossumd57fd912000-03-10 22:53:23 +000010088 /* Second pass: create output string and fill it */
10089 u = _PyUnicode_New(i + j);
10090 if (!u)
10091 return NULL;
10092
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010093 j = 0; /* same as in first pass */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010094 q = _PyUnicode_WSTR(u); /* next output char */
10095 qe = _PyUnicode_WSTR(u) + PyUnicode_GET_SIZE(u); /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +000010096
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010097 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010098 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010099 if (tabsize > 0) {
10100 i = tabsize - (j % tabsize);
10101 j += i;
10102 while (i--) {
10103 if (q >= qe)
10104 goto overflow2;
10105 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010106 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010107 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010108 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010109 else {
10110 if (q >= qe)
10111 goto overflow2;
10112 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010113 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010114 if (*p == '\n' || *p == '\r')
10115 j = 0;
10116 }
10117
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010118 if (PyUnicode_READY(u) == -1) {
10119 Py_DECREF(u);
10120 return NULL;
10121 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010122 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010123
10124 overflow2:
10125 Py_DECREF(u);
10126 overflow1:
10127 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10128 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010129}
10130
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010131PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010132 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010133\n\
10134Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010135such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010136arguments start and end are interpreted as in slice notation.\n\
10137\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010138Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010139
10140static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010141unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010142{
Jesus Ceaac451502011-04-20 17:09:23 +020010143 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010144 Py_ssize_t start;
10145 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010146 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010147
Jesus Ceaac451502011-04-20 17:09:23 +020010148 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10149 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010150 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010151
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010152 if (PyUnicode_READY(self) == -1)
10153 return NULL;
10154 if (PyUnicode_READY(substring) == -1)
10155 return NULL;
10156
10157 result = any_find_slice(
10158 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
10159 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010160 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010161
10162 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010163
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010164 if (result == -2)
10165 return NULL;
10166
Christian Heimes217cfd12007-12-02 14:31:20 +000010167 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010168}
10169
10170static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010171unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010172{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010173 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
10174 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010175 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010176 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010177}
10178
Guido van Rossumc2504932007-09-18 19:42:40 +000010179/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010010180 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000010181static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +000010182unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010183{
Guido van Rossumc2504932007-09-18 19:42:40 +000010184 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010010185 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010186
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010187 if (_PyUnicode_HASH(self) != -1)
10188 return _PyUnicode_HASH(self);
10189 if (PyUnicode_READY(self) == -1)
10190 return -1;
10191 len = PyUnicode_GET_LENGTH(self);
10192
10193 /* The hash function as a macro, gets expanded three times below. */
10194#define HASH(P) \
10195 x = (Py_uhash_t)*P << 7; \
10196 while (--len >= 0) \
10197 x = (1000003*x) ^ (Py_uhash_t)*P++;
10198
10199 switch (PyUnicode_KIND(self)) {
10200 case PyUnicode_1BYTE_KIND: {
10201 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
10202 HASH(c);
10203 break;
10204 }
10205 case PyUnicode_2BYTE_KIND: {
10206 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
10207 HASH(s);
10208 break;
10209 }
10210 default: {
10211 Py_UCS4 *l;
10212 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
10213 "Impossible switch case in unicode_hash");
10214 l = PyUnicode_4BYTE_DATA(self);
10215 HASH(l);
10216 break;
10217 }
10218 }
10219 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
10220
Guido van Rossumc2504932007-09-18 19:42:40 +000010221 if (x == -1)
10222 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010223 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010224 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010225}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010226#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000010227
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010228PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010229 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010230\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010231Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010232
10233static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010234unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010235{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010236 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +020010237 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010238 Py_ssize_t start;
10239 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010240
Jesus Ceaac451502011-04-20 17:09:23 +020010241 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
10242 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010243 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010244
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010245 if (PyUnicode_READY(self) == -1)
10246 return NULL;
10247 if (PyUnicode_READY(substring) == -1)
10248 return NULL;
10249
10250 result = any_find_slice(
10251 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
10252 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010253 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010254
10255 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010256
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010257 if (result == -2)
10258 return NULL;
10259
Guido van Rossumd57fd912000-03-10 22:53:23 +000010260 if (result < 0) {
10261 PyErr_SetString(PyExc_ValueError, "substring not found");
10262 return NULL;
10263 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010264
Christian Heimes217cfd12007-12-02 14:31:20 +000010265 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010266}
10267
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010268PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010269 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010270\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010271Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010272at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010273
10274static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010275unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010276{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010277 Py_ssize_t i, length;
10278 int kind;
10279 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010280 int cased;
10281
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010282 if (PyUnicode_READY(self) == -1)
10283 return NULL;
10284 length = PyUnicode_GET_LENGTH(self);
10285 kind = PyUnicode_KIND(self);
10286 data = PyUnicode_DATA(self);
10287
Guido van Rossumd57fd912000-03-10 22:53:23 +000010288 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010289 if (length == 1)
10290 return PyBool_FromLong(
10291 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010292
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010293 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010294 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010295 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010296
Guido van Rossumd57fd912000-03-10 22:53:23 +000010297 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010298 for (i = 0; i < length; i++) {
10299 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010300
Benjamin Peterson29060642009-01-31 22:14:21 +000010301 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
10302 return PyBool_FromLong(0);
10303 else if (!cased && Py_UNICODE_ISLOWER(ch))
10304 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010305 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010306 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010307}
10308
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010309PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010310 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010311\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010312Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010313at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010314
10315static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010316unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010317{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010318 Py_ssize_t i, length;
10319 int kind;
10320 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010321 int cased;
10322
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010323 if (PyUnicode_READY(self) == -1)
10324 return NULL;
10325 length = PyUnicode_GET_LENGTH(self);
10326 kind = PyUnicode_KIND(self);
10327 data = PyUnicode_DATA(self);
10328
Guido van Rossumd57fd912000-03-10 22:53:23 +000010329 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010330 if (length == 1)
10331 return PyBool_FromLong(
10332 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010333
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010334 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010335 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010336 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010337
Guido van Rossumd57fd912000-03-10 22:53:23 +000010338 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010339 for (i = 0; i < length; i++) {
10340 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010341
Benjamin Peterson29060642009-01-31 22:14:21 +000010342 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
10343 return PyBool_FromLong(0);
10344 else if (!cased && Py_UNICODE_ISUPPER(ch))
10345 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010346 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010347 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010348}
10349
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010350PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010351 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010352\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010353Return True if S is a titlecased string and there is at least one\n\
10354character in S, i.e. upper- and titlecase characters may only\n\
10355follow uncased characters and lowercase characters only cased ones.\n\
10356Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010357
10358static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010359unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010360{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010361 Py_ssize_t i, length;
10362 int kind;
10363 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010364 int cased, previous_is_cased;
10365
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010366 if (PyUnicode_READY(self) == -1)
10367 return NULL;
10368 length = PyUnicode_GET_LENGTH(self);
10369 kind = PyUnicode_KIND(self);
10370 data = PyUnicode_DATA(self);
10371
Guido van Rossumd57fd912000-03-10 22:53:23 +000010372 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010373 if (length == 1) {
10374 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10375 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
10376 (Py_UNICODE_ISUPPER(ch) != 0));
10377 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010378
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010379 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010380 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010381 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010382
Guido van Rossumd57fd912000-03-10 22:53:23 +000010383 cased = 0;
10384 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010385 for (i = 0; i < length; i++) {
10386 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010387
Benjamin Peterson29060642009-01-31 22:14:21 +000010388 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
10389 if (previous_is_cased)
10390 return PyBool_FromLong(0);
10391 previous_is_cased = 1;
10392 cased = 1;
10393 }
10394 else if (Py_UNICODE_ISLOWER(ch)) {
10395 if (!previous_is_cased)
10396 return PyBool_FromLong(0);
10397 previous_is_cased = 1;
10398 cased = 1;
10399 }
10400 else
10401 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010402 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010403 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010404}
10405
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010406PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010407 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010408\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010409Return True if all characters in S are whitespace\n\
10410and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010411
10412static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010413unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010414{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010415 Py_ssize_t i, length;
10416 int kind;
10417 void *data;
10418
10419 if (PyUnicode_READY(self) == -1)
10420 return NULL;
10421 length = PyUnicode_GET_LENGTH(self);
10422 kind = PyUnicode_KIND(self);
10423 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010424
Guido van Rossumd57fd912000-03-10 22:53:23 +000010425 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010426 if (length == 1)
10427 return PyBool_FromLong(
10428 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010429
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010430 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010431 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010432 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010433
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010434 for (i = 0; i < length; i++) {
10435 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010436 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010437 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010438 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010439 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010440}
10441
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010442PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010443 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010444\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010445Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010446and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010447
10448static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010449unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010450{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010451 Py_ssize_t i, length;
10452 int kind;
10453 void *data;
10454
10455 if (PyUnicode_READY(self) == -1)
10456 return NULL;
10457 length = PyUnicode_GET_LENGTH(self);
10458 kind = PyUnicode_KIND(self);
10459 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010460
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010461 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010462 if (length == 1)
10463 return PyBool_FromLong(
10464 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010465
10466 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010467 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010468 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010469
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010470 for (i = 0; i < length; i++) {
10471 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010472 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010473 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010474 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010475}
10476
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010477PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010478 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010479\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010480Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010481and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010482
10483static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010484unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010485{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010486 int kind;
10487 void *data;
10488 Py_ssize_t len, i;
10489
10490 if (PyUnicode_READY(self) == -1)
10491 return NULL;
10492
10493 kind = PyUnicode_KIND(self);
10494 data = PyUnicode_DATA(self);
10495 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010496
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010497 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010498 if (len == 1) {
10499 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10500 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10501 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010502
10503 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010504 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010505 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010506
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010507 for (i = 0; i < len; i++) {
10508 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010509 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010510 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010511 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010512 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010513}
10514
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010515PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010516 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010517\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010518Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010519False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010520
10521static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010522unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010523{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010524 Py_ssize_t i, length;
10525 int kind;
10526 void *data;
10527
10528 if (PyUnicode_READY(self) == -1)
10529 return NULL;
10530 length = PyUnicode_GET_LENGTH(self);
10531 kind = PyUnicode_KIND(self);
10532 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010533
Guido van Rossumd57fd912000-03-10 22:53:23 +000010534 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010535 if (length == 1)
10536 return PyBool_FromLong(
10537 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010538
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010539 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010540 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010541 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010542
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010543 for (i = 0; i < length; i++) {
10544 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010545 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010546 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010547 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010548}
10549
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010550PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010551 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010552\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010553Return True if all characters in S are digits\n\
10554and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010555
10556static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010557unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010558{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010559 Py_ssize_t i, length;
10560 int kind;
10561 void *data;
10562
10563 if (PyUnicode_READY(self) == -1)
10564 return NULL;
10565 length = PyUnicode_GET_LENGTH(self);
10566 kind = PyUnicode_KIND(self);
10567 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010568
Guido van Rossumd57fd912000-03-10 22:53:23 +000010569 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010570 if (length == 1) {
10571 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10572 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
10573 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010574
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010575 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010576 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010577 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010578
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010579 for (i = 0; i < length; i++) {
10580 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010581 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010582 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010583 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010584}
10585
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010586PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010587 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010588\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010589Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010590False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010591
10592static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010593unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010594{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010595 Py_ssize_t i, length;
10596 int kind;
10597 void *data;
10598
10599 if (PyUnicode_READY(self) == -1)
10600 return NULL;
10601 length = PyUnicode_GET_LENGTH(self);
10602 kind = PyUnicode_KIND(self);
10603 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010604
Guido van Rossumd57fd912000-03-10 22:53:23 +000010605 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010606 if (length == 1)
10607 return PyBool_FromLong(
10608 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010609
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010610 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010611 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010612 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010613
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010614 for (i = 0; i < length; i++) {
10615 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010616 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010617 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010618 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010619}
10620
Martin v. Löwis47383402007-08-15 07:32:56 +000010621int
10622PyUnicode_IsIdentifier(PyObject *self)
10623{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010624 int kind;
10625 void *data;
10626 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010627 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000010628
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010629 if (PyUnicode_READY(self) == -1) {
10630 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000010631 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010632 }
10633
10634 /* Special case for empty strings */
10635 if (PyUnicode_GET_LENGTH(self) == 0)
10636 return 0;
10637 kind = PyUnicode_KIND(self);
10638 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000010639
10640 /* PEP 3131 says that the first character must be in
10641 XID_Start and subsequent characters in XID_Continue,
10642 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000010643 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000010644 letters, digits, underscore). However, given the current
10645 definition of XID_Start and XID_Continue, it is sufficient
10646 to check just for these, except that _ must be allowed
10647 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010648 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050010649 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000010650 return 0;
10651
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040010652 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010653 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010654 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000010655 return 1;
10656}
10657
10658PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010659 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000010660\n\
10661Return True if S is a valid identifier according\n\
10662to the language definition.");
10663
10664static PyObject*
10665unicode_isidentifier(PyObject *self)
10666{
10667 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
10668}
10669
Georg Brandl559e5d72008-06-11 18:37:52 +000010670PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010671 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000010672\n\
10673Return True if all characters in S are considered\n\
10674printable in repr() or S is empty, False otherwise.");
10675
10676static PyObject*
10677unicode_isprintable(PyObject *self)
10678{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010679 Py_ssize_t i, length;
10680 int kind;
10681 void *data;
10682
10683 if (PyUnicode_READY(self) == -1)
10684 return NULL;
10685 length = PyUnicode_GET_LENGTH(self);
10686 kind = PyUnicode_KIND(self);
10687 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000010688
10689 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010690 if (length == 1)
10691 return PyBool_FromLong(
10692 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000010693
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010694 for (i = 0; i < length; i++) {
10695 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010696 Py_RETURN_FALSE;
10697 }
10698 }
10699 Py_RETURN_TRUE;
10700}
10701
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010702PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000010703 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010704\n\
10705Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000010706iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010707
10708static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010709unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010710{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010711 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010712}
10713
Martin v. Löwis18e16552006-02-15 17:27:45 +000010714static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000010715unicode_length(PyUnicodeObject *self)
10716{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010717 if (PyUnicode_READY(self) == -1)
10718 return -1;
10719 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010720}
10721
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010722PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010723 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010724\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010725Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010726done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010727
10728static PyObject *
10729unicode_ljust(PyUnicodeObject *self, PyObject *args)
10730{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010731 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010732 Py_UCS4 fillchar = ' ';
10733
10734 if (PyUnicode_READY(self) == -1)
10735 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010736
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010737 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010738 return NULL;
10739
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010740 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010741 Py_INCREF(self);
10742 return (PyObject*) self;
10743 }
10744
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010745 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010746}
10747
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010748PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010749 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010750\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010751Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010752
10753static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010754unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010755{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010756 return fixup(self, fixlower);
10757}
10758
/* Strip modes; the values double as indexes into stripformat[] */
#define LEFTSTRIP   0
#define RIGHTSTRIP  1
#define BOTHSTRIP   2

/* Argument format strings, indexed by the strip modes above */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip mode i: the format string minus its 3-character
   "|O:" prefix */
#define STRIPNAME(i) (&stripformat[i][3])
10767
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010768/* externally visible for str.strip(unicode) */
10769PyObject *
10770_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
10771{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010772 void *data;
10773 int kind;
10774 Py_ssize_t i, j, len;
10775 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010776
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010777 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
10778 return NULL;
10779
10780 kind = PyUnicode_KIND(self);
10781 data = PyUnicode_DATA(self);
10782 len = PyUnicode_GET_LENGTH(self);
10783 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
10784 PyUnicode_DATA(sepobj),
10785 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010786
Benjamin Peterson14339b62009-01-31 16:36:08 +000010787 i = 0;
10788 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010789 while (i < len &&
10790 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010791 i++;
10792 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010793 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010794
Benjamin Peterson14339b62009-01-31 16:36:08 +000010795 j = len;
10796 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010797 do {
10798 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010799 } while (j >= i &&
10800 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000010801 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010802 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010803
Victor Stinner12bab6d2011-10-01 01:53:49 +020010804 return PyUnicode_Substring((PyObject*)self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010805}
10806
10807PyObject*
10808PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
10809{
10810 unsigned char *data;
10811 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020010812 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010813
Victor Stinnerde636f32011-10-01 03:55:54 +020010814 if (PyUnicode_READY(self) == -1)
10815 return NULL;
10816
10817 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
10818
Victor Stinner12bab6d2011-10-01 01:53:49 +020010819 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010820 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020010821 if (PyUnicode_CheckExact(self)) {
10822 Py_INCREF(self);
10823 return self;
10824 }
10825 else
10826 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010827 }
10828
Victor Stinner12bab6d2011-10-01 01:53:49 +020010829 length = end - start;
10830 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010831 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010832
Victor Stinnerde636f32011-10-01 03:55:54 +020010833 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020010834 PyErr_SetString(PyExc_IndexError, "string index out of range");
10835 return NULL;
10836 }
10837
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010838 kind = PyUnicode_KIND(self);
10839 data = PyUnicode_1BYTE_DATA(self);
Victor Stinner034f6cf2011-09-30 02:26:44 +020010840 return PyUnicode_FromKindAndData(kind,
10841 data + PyUnicode_KIND_SIZE(kind, start),
Victor Stinner12bab6d2011-10-01 01:53:49 +020010842 length);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010843}
Guido van Rossumd57fd912000-03-10 22:53:23 +000010844
10845static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010846do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010847{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010848 int kind;
10849 void *data;
10850 Py_ssize_t len, i, j;
10851
10852 if (PyUnicode_READY(self) == -1)
10853 return NULL;
10854
10855 kind = PyUnicode_KIND(self);
10856 data = PyUnicode_DATA(self);
10857 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010858
Benjamin Peterson14339b62009-01-31 16:36:08 +000010859 i = 0;
10860 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010861 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010862 i++;
10863 }
10864 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010865
Benjamin Peterson14339b62009-01-31 16:36:08 +000010866 j = len;
10867 if (striptype != LEFTSTRIP) {
10868 do {
10869 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010870 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010871 j++;
10872 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010873
Victor Stinner12bab6d2011-10-01 01:53:49 +020010874 return PyUnicode_Substring((PyObject*)self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010875}
10876
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010877
10878static PyObject *
10879do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
10880{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010881 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010882
Benjamin Peterson14339b62009-01-31 16:36:08 +000010883 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
10884 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010885
Benjamin Peterson14339b62009-01-31 16:36:08 +000010886 if (sep != NULL && sep != Py_None) {
10887 if (PyUnicode_Check(sep))
10888 return _PyUnicode_XStrip(self, striptype, sep);
10889 else {
10890 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010891 "%s arg must be None or str",
10892 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010893 return NULL;
10894 }
10895 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010896
Benjamin Peterson14339b62009-01-31 16:36:08 +000010897 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010898}
10899
10900
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010901PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010902 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010903\n\
10904Return a copy of the string S with leading and trailing\n\
10905whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010906If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010907
10908static PyObject *
10909unicode_strip(PyUnicodeObject *self, PyObject *args)
10910{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010911 if (PyTuple_GET_SIZE(args) == 0)
10912 return do_strip(self, BOTHSTRIP); /* Common case */
10913 else
10914 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010915}
10916
10917
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010918PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010919 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010920\n\
10921Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010922If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010923
10924static PyObject *
10925unicode_lstrip(PyUnicodeObject *self, PyObject *args)
10926{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010927 if (PyTuple_GET_SIZE(args) == 0)
10928 return do_strip(self, LEFTSTRIP); /* Common case */
10929 else
10930 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010931}
10932
10933
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010934PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010935 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010936\n\
10937Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010938If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010939
10940static PyObject *
10941unicode_rstrip(PyUnicodeObject *self, PyObject *args)
10942{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010943 if (PyTuple_GET_SIZE(args) == 0)
10944 return do_strip(self, RIGHTSTRIP); /* Common case */
10945 else
10946 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010947}
10948
10949
Guido van Rossumd57fd912000-03-10 22:53:23 +000010950static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000010951unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010952{
10953 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010954 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010955
Georg Brandl222de0f2009-04-12 12:01:50 +000010956 if (len < 1) {
10957 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020010958 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000010959 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010960
Tim Peters7a29bd52001-09-12 03:03:31 +000010961 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010962 /* no repeat, return original string */
10963 Py_INCREF(str);
10964 return (PyObject*) str;
10965 }
Tim Peters8f422462000-09-09 06:13:41 +000010966
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010967 if (PyUnicode_READY(str) == -1)
10968 return NULL;
10969
Victor Stinnerc759f3e2011-10-01 03:09:58 +020010970 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020010971 PyErr_SetString(PyExc_OverflowError,
10972 "repeated string is too long");
10973 return NULL;
10974 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010975 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020010976
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010977 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010978 if (!u)
10979 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020010980 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010981
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010982 if (PyUnicode_GET_LENGTH(str) == 1) {
10983 const int kind = PyUnicode_KIND(str);
10984 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
10985 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020010986 if (kind == PyUnicode_1BYTE_KIND)
10987 memset(to, (unsigned char)fill_char, len);
10988 else {
10989 for (n = 0; n < len; ++n)
10990 PyUnicode_WRITE(kind, to, n, fill_char);
10991 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010992 }
10993 else {
10994 /* number of characters copied this far */
10995 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
10996 const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str);
10997 char *to = (char *) PyUnicode_DATA(u);
10998 Py_MEMCPY(to, PyUnicode_DATA(str),
10999 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011000 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011001 n = (done <= nchars-done) ? done : nchars-done;
11002 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011003 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011004 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011005 }
11006
11007 return (PyObject*) u;
11008}
11009
Alexander Belopolsky40018472011-02-26 01:02:56 +000011010PyObject *
11011PyUnicode_Replace(PyObject *obj,
11012 PyObject *subobj,
11013 PyObject *replobj,
11014 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011015{
11016 PyObject *self;
11017 PyObject *str1;
11018 PyObject *str2;
11019 PyObject *result;
11020
11021 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011022 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011023 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011024 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011025 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011026 Py_DECREF(self);
11027 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011028 }
11029 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011030 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011031 Py_DECREF(self);
11032 Py_DECREF(str1);
11033 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011034 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011035 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011036 Py_DECREF(self);
11037 Py_DECREF(str1);
11038 Py_DECREF(str2);
11039 return result;
11040}
11041
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011042PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011043 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011044\n\
11045Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011046old replaced by new. If the optional argument count is\n\
11047given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011048
11049static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011050unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011051{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011052 PyObject *str1;
11053 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011054 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011055 PyObject *result;
11056
Martin v. Löwis18e16552006-02-15 17:27:45 +000011057 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011058 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011059 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011060 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011061 str1 = PyUnicode_FromObject(str1);
11062 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11063 return NULL;
11064 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011065 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011066 Py_DECREF(str1);
11067 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011068 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011069
11070 result = replace(self, str1, str2, maxcount);
11071
11072 Py_DECREF(str1);
11073 Py_DECREF(str2);
11074 return result;
11075}
11076
Alexander Belopolsky40018472011-02-26 01:02:56 +000011077static PyObject *
11078unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011079{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011080 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011081 Py_ssize_t isize;
11082 Py_ssize_t osize, squote, dquote, i, o;
11083 Py_UCS4 max, quote;
11084 int ikind, okind;
11085 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011086
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011087 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011088 return NULL;
11089
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011090 isize = PyUnicode_GET_LENGTH(unicode);
11091 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011092
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011093 /* Compute length of output, quote characters, and
11094 maximum character */
11095 osize = 2; /* quotes */
11096 max = 127;
11097 squote = dquote = 0;
11098 ikind = PyUnicode_KIND(unicode);
11099 for (i = 0; i < isize; i++) {
11100 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11101 switch (ch) {
11102 case '\'': squote++; osize++; break;
11103 case '"': dquote++; osize++; break;
11104 case '\\': case '\t': case '\r': case '\n':
11105 osize += 2; break;
11106 default:
11107 /* Fast-path ASCII */
11108 if (ch < ' ' || ch == 0x7f)
11109 osize += 4; /* \xHH */
11110 else if (ch < 0x7f)
11111 osize++;
11112 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11113 osize++;
11114 max = ch > max ? ch : max;
11115 }
11116 else if (ch < 0x100)
11117 osize += 4; /* \xHH */
11118 else if (ch < 0x10000)
11119 osize += 6; /* \uHHHH */
11120 else
11121 osize += 10; /* \uHHHHHHHH */
11122 }
11123 }
11124
11125 quote = '\'';
11126 if (squote) {
11127 if (dquote)
11128 /* Both squote and dquote present. Use squote,
11129 and escape them */
11130 osize += squote;
11131 else
11132 quote = '"';
11133 }
11134
11135 repr = PyUnicode_New(osize, max);
11136 if (repr == NULL)
11137 return NULL;
11138 okind = PyUnicode_KIND(repr);
11139 odata = PyUnicode_DATA(repr);
11140
11141 PyUnicode_WRITE(okind, odata, 0, quote);
11142 PyUnicode_WRITE(okind, odata, osize-1, quote);
11143
11144 for (i = 0, o = 1; i < isize; i++) {
11145 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011146
11147 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011148 if ((ch == quote) || (ch == '\\')) {
11149 PyUnicode_WRITE(okind, odata, o++, '\\');
11150 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011151 continue;
11152 }
11153
Benjamin Peterson29060642009-01-31 22:14:21 +000011154 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011155 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011156 PyUnicode_WRITE(okind, odata, o++, '\\');
11157 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011158 }
11159 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011160 PyUnicode_WRITE(okind, odata, o++, '\\');
11161 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011162 }
11163 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011164 PyUnicode_WRITE(okind, odata, o++, '\\');
11165 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011166 }
11167
11168 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011169 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011170 PyUnicode_WRITE(okind, odata, o++, '\\');
11171 PyUnicode_WRITE(okind, odata, o++, 'x');
11172 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11173 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011174 }
11175
Georg Brandl559e5d72008-06-11 18:37:52 +000011176 /* Copy ASCII characters as-is */
11177 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011178 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011179 }
11180
Benjamin Peterson29060642009-01-31 22:14:21 +000011181 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000011182 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011183 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000011184 (categories Z* and C* except ASCII space)
11185 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011186 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011187 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011188 if (ch <= 0xff) {
11189 PyUnicode_WRITE(okind, odata, o++, '\\');
11190 PyUnicode_WRITE(okind, odata, o++, 'x');
11191 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11192 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011193 }
11194 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011195 else if (ch >= 0x10000) {
11196 PyUnicode_WRITE(okind, odata, o++, '\\');
11197 PyUnicode_WRITE(okind, odata, o++, 'U');
11198 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
11199 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
11200 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
11201 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
11202 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11203 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11204 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11205 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011206 }
11207 /* Map 16-bit characters to '\uxxxx' */
11208 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011209 PyUnicode_WRITE(okind, odata, o++, '\\');
11210 PyUnicode_WRITE(okind, odata, o++, 'u');
11211 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11212 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11213 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11214 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011215 }
11216 }
11217 /* Copy characters as-is */
11218 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011219 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011220 }
11221 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000011222 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011223 /* Closing quote already added at the beginning */
Walter Dörwald79e913e2007-05-12 11:08:06 +000011224 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011225}
11226
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011227PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011228 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011229\n\
11230Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011231such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011232arguments start and end are interpreted as in slice notation.\n\
11233\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011234Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011235
11236static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011237unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011238{
Jesus Ceaac451502011-04-20 17:09:23 +020011239 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011240 Py_ssize_t start;
11241 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011242 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011243
Jesus Ceaac451502011-04-20 17:09:23 +020011244 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
11245 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011246 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011247
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011248 if (PyUnicode_READY(self) == -1)
11249 return NULL;
11250 if (PyUnicode_READY(substring) == -1)
11251 return NULL;
11252
11253 result = any_find_slice(
11254 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11255 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011256 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011257
11258 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011259
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011260 if (result == -2)
11261 return NULL;
11262
Christian Heimes217cfd12007-12-02 14:31:20 +000011263 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011264}
11265
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011266PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011267 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011268\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011269Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011270
11271static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011272unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011273{
Jesus Ceaac451502011-04-20 17:09:23 +020011274 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011275 Py_ssize_t start;
11276 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011277 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011278
Jesus Ceaac451502011-04-20 17:09:23 +020011279 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
11280 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011281 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011282
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011283 if (PyUnicode_READY(self) == -1)
11284 return NULL;
11285 if (PyUnicode_READY(substring) == -1)
11286 return NULL;
11287
11288 result = any_find_slice(
11289 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11290 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011291 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011292
11293 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011294
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011295 if (result == -2)
11296 return NULL;
11297
Guido van Rossumd57fd912000-03-10 22:53:23 +000011298 if (result < 0) {
11299 PyErr_SetString(PyExc_ValueError, "substring not found");
11300 return NULL;
11301 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011302
Christian Heimes217cfd12007-12-02 14:31:20 +000011303 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011304}
11305
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011306PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011307 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011308\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011309Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011310done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011311
11312static PyObject *
11313unicode_rjust(PyUnicodeObject *self, PyObject *args)
11314{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011315 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011316 Py_UCS4 fillchar = ' ';
11317
Victor Stinnere9a29352011-10-01 02:14:59 +020011318 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011319 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011320
Victor Stinnere9a29352011-10-01 02:14:59 +020011321 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011322 return NULL;
11323
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011324 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011325 Py_INCREF(self);
11326 return (PyObject*) self;
11327 }
11328
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011329 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011330}
11331
Alexander Belopolsky40018472011-02-26 01:02:56 +000011332PyObject *
11333PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011334{
11335 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000011336
Guido van Rossumd57fd912000-03-10 22:53:23 +000011337 s = PyUnicode_FromObject(s);
11338 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011339 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011340 if (sep != NULL) {
11341 sep = PyUnicode_FromObject(sep);
11342 if (sep == NULL) {
11343 Py_DECREF(s);
11344 return NULL;
11345 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011346 }
11347
11348 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11349
11350 Py_DECREF(s);
11351 Py_XDECREF(sep);
11352 return result;
11353}
11354
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011355PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011356 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011357\n\
11358Return a list of the words in S, using sep as the\n\
11359delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000011360splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000011361whitespace string is a separator and empty strings are\n\
11362removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011363
11364static PyObject*
11365unicode_split(PyUnicodeObject *self, PyObject *args)
11366{
11367 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011368 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011369
Martin v. Löwis18e16552006-02-15 17:27:45 +000011370 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011371 return NULL;
11372
11373 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011374 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011375 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011376 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011377 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011378 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011379}
11380
Thomas Wouters477c8d52006-05-27 19:21:47 +000011381PyObject *
11382PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11383{
11384 PyObject* str_obj;
11385 PyObject* sep_obj;
11386 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011387 int kind1, kind2, kind;
11388 void *buf1 = NULL, *buf2 = NULL;
11389 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011390
11391 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020011392 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011393 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011394 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011395 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011396 Py_DECREF(str_obj);
11397 return NULL;
11398 }
11399
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011400 kind1 = PyUnicode_KIND(str_in);
11401 kind2 = PyUnicode_KIND(sep_obj);
11402 kind = kind1 > kind2 ? kind1 : kind2;
11403 buf1 = PyUnicode_DATA(str_in);
11404 if (kind1 != kind)
11405 buf1 = _PyUnicode_AsKind(str_in, kind);
11406 if (!buf1)
11407 goto onError;
11408 buf2 = PyUnicode_DATA(sep_obj);
11409 if (kind2 != kind)
11410 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11411 if (!buf2)
11412 goto onError;
11413 len1 = PyUnicode_GET_LENGTH(str_obj);
11414 len2 = PyUnicode_GET_LENGTH(sep_obj);
11415
11416 switch(PyUnicode_KIND(str_in)) {
11417 case PyUnicode_1BYTE_KIND:
11418 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11419 break;
11420 case PyUnicode_2BYTE_KIND:
11421 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11422 break;
11423 case PyUnicode_4BYTE_KIND:
11424 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11425 break;
11426 default:
11427 assert(0);
11428 out = 0;
11429 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011430
11431 Py_DECREF(sep_obj);
11432 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011433 if (kind1 != kind)
11434 PyMem_Free(buf1);
11435 if (kind2 != kind)
11436 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011437
11438 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011439 onError:
11440 Py_DECREF(sep_obj);
11441 Py_DECREF(str_obj);
11442 if (kind1 != kind && buf1)
11443 PyMem_Free(buf1);
11444 if (kind2 != kind && buf2)
11445 PyMem_Free(buf2);
11446 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011447}
11448
11449
11450PyObject *
11451PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11452{
11453 PyObject* str_obj;
11454 PyObject* sep_obj;
11455 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011456 int kind1, kind2, kind;
11457 void *buf1 = NULL, *buf2 = NULL;
11458 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011459
11460 str_obj = PyUnicode_FromObject(str_in);
11461 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000011462 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011463 sep_obj = PyUnicode_FromObject(sep_in);
11464 if (!sep_obj) {
11465 Py_DECREF(str_obj);
11466 return NULL;
11467 }
11468
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011469 kind1 = PyUnicode_KIND(str_in);
11470 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020011471 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011472 buf1 = PyUnicode_DATA(str_in);
11473 if (kind1 != kind)
11474 buf1 = _PyUnicode_AsKind(str_in, kind);
11475 if (!buf1)
11476 goto onError;
11477 buf2 = PyUnicode_DATA(sep_obj);
11478 if (kind2 != kind)
11479 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11480 if (!buf2)
11481 goto onError;
11482 len1 = PyUnicode_GET_LENGTH(str_obj);
11483 len2 = PyUnicode_GET_LENGTH(sep_obj);
11484
11485 switch(PyUnicode_KIND(str_in)) {
11486 case PyUnicode_1BYTE_KIND:
11487 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11488 break;
11489 case PyUnicode_2BYTE_KIND:
11490 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11491 break;
11492 case PyUnicode_4BYTE_KIND:
11493 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11494 break;
11495 default:
11496 assert(0);
11497 out = 0;
11498 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011499
11500 Py_DECREF(sep_obj);
11501 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011502 if (kind1 != kind)
11503 PyMem_Free(buf1);
11504 if (kind2 != kind)
11505 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011506
11507 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011508 onError:
11509 Py_DECREF(sep_obj);
11510 Py_DECREF(str_obj);
11511 if (kind1 != kind && buf1)
11512 PyMem_Free(buf1);
11513 if (kind2 != kind && buf2)
11514 PyMem_Free(buf2);
11515 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011516}
11517
11518PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011519 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011520\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011521Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011522the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011523found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011524
11525static PyObject*
11526unicode_partition(PyUnicodeObject *self, PyObject *separator)
11527{
11528 return PyUnicode_Partition((PyObject *)self, separator);
11529}
11530
11531PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000011532 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011533\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011534Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011535the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011536separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011537
11538static PyObject*
11539unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
11540{
11541 return PyUnicode_RPartition((PyObject *)self, separator);
11542}
11543
Alexander Belopolsky40018472011-02-26 01:02:56 +000011544PyObject *
11545PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011546{
11547 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011548
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011549 s = PyUnicode_FromObject(s);
11550 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011551 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011552 if (sep != NULL) {
11553 sep = PyUnicode_FromObject(sep);
11554 if (sep == NULL) {
11555 Py_DECREF(s);
11556 return NULL;
11557 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011558 }
11559
11560 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11561
11562 Py_DECREF(s);
11563 Py_XDECREF(sep);
11564 return result;
11565}
11566
11567PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011568 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011569\n\
11570Return a list of the words in S, using sep as the\n\
11571delimiter string, starting at the end of the string and\n\
11572working to the front. If maxsplit is given, at most maxsplit\n\
11573splits are done. If sep is not specified, any whitespace string\n\
11574is a separator.");
11575
11576static PyObject*
11577unicode_rsplit(PyUnicodeObject *self, PyObject *args)
11578{
11579 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011580 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011581
Martin v. Löwis18e16552006-02-15 17:27:45 +000011582 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011583 return NULL;
11584
11585 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011586 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011587 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011588 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011589 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011590 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011591}
11592
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011593PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011594 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011595\n\
11596Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000011597Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011598is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011599
11600static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011601unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011602{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011603 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000011604 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011605
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011606 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
11607 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011608 return NULL;
11609
Guido van Rossum86662912000-04-11 15:38:46 +000011610 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011611}
11612
11613static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000011614PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011615{
Walter Dörwald346737f2007-05-31 10:44:43 +000011616 if (PyUnicode_CheckExact(self)) {
11617 Py_INCREF(self);
11618 return self;
11619 } else
11620 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020011621 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011622}
11623
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011624PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011625 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011626\n\
11627Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011628and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011629
11630static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011631unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011632{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011633 return fixup(self, fixswapcase);
11634}
11635
Georg Brandlceee0772007-11-27 23:48:05 +000011636PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011637 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011638\n\
11639Return a translation table usable for str.translate().\n\
11640If there is only one argument, it must be a dictionary mapping Unicode\n\
11641ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011642Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011643If there are two arguments, they must be strings of equal length, and\n\
11644in the resulting dictionary, each character in x will be mapped to the\n\
11645character at the same position in y. If there is a third argument, it\n\
11646must be a string, whose characters will be mapped to None in the result.");
11647
11648static PyObject*
11649unicode_maketrans(PyUnicodeObject *null, PyObject *args)
11650{
11651 PyObject *x, *y = NULL, *z = NULL;
11652 PyObject *new = NULL, *key, *value;
11653 Py_ssize_t i = 0;
11654 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011655
Georg Brandlceee0772007-11-27 23:48:05 +000011656 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
11657 return NULL;
11658 new = PyDict_New();
11659 if (!new)
11660 return NULL;
11661 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011662 int x_kind, y_kind, z_kind;
11663 void *x_data, *y_data, *z_data;
11664
Georg Brandlceee0772007-11-27 23:48:05 +000011665 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000011666 if (!PyUnicode_Check(x)) {
11667 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
11668 "be a string if there is a second argument");
11669 goto err;
11670 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011671 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011672 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
11673 "arguments must have equal length");
11674 goto err;
11675 }
11676 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011677 x_kind = PyUnicode_KIND(x);
11678 y_kind = PyUnicode_KIND(y);
11679 x_data = PyUnicode_DATA(x);
11680 y_data = PyUnicode_DATA(y);
11681 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
11682 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
11683 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011684 if (!key || !value)
11685 goto err;
11686 res = PyDict_SetItem(new, key, value);
11687 Py_DECREF(key);
11688 Py_DECREF(value);
11689 if (res < 0)
11690 goto err;
11691 }
11692 /* create entries for deleting chars in z */
11693 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011694 z_kind = PyUnicode_KIND(z);
11695 z_data = PyUnicode_DATA(z);
Georg Brandlceee0772007-11-27 23:48:05 +000011696 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011697 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011698 if (!key)
11699 goto err;
11700 res = PyDict_SetItem(new, key, Py_None);
11701 Py_DECREF(key);
11702 if (res < 0)
11703 goto err;
11704 }
11705 }
11706 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011707 int kind;
11708 void *data;
11709
Georg Brandlceee0772007-11-27 23:48:05 +000011710 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000011711 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011712 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
11713 "to maketrans it must be a dict");
11714 goto err;
11715 }
11716 /* copy entries into the new dict, converting string keys to int keys */
11717 while (PyDict_Next(x, &i, &key, &value)) {
11718 if (PyUnicode_Check(key)) {
11719 /* convert string keys to integer keys */
11720 PyObject *newkey;
11721 if (PyUnicode_GET_SIZE(key) != 1) {
11722 PyErr_SetString(PyExc_ValueError, "string keys in translate "
11723 "table must be of length 1");
11724 goto err;
11725 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011726 kind = PyUnicode_KIND(key);
11727 data = PyUnicode_DATA(key);
11728 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000011729 if (!newkey)
11730 goto err;
11731 res = PyDict_SetItem(new, newkey, value);
11732 Py_DECREF(newkey);
11733 if (res < 0)
11734 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000011735 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011736 /* just keep integer keys */
11737 if (PyDict_SetItem(new, key, value) < 0)
11738 goto err;
11739 } else {
11740 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
11741 "be strings or integers");
11742 goto err;
11743 }
11744 }
11745 }
11746 return new;
11747 err:
11748 Py_DECREF(new);
11749 return NULL;
11750}
11751
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011752PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011753 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011754\n\
11755Return a copy of the string S, where all characters have been mapped\n\
11756through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011757Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000011758Unmapped characters are left untouched. Characters mapped to None\n\
11759are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011760
11761static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011762unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011763{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011764 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011765}
11766
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011767PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011768 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011769\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011770Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011771
11772static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011773unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011774{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011775 return fixup(self, fixupper);
11776}
11777
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011778PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011779 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011780\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000011781Pad a numeric string S with zeros on the left, to fill a field\n\
11782of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011783
11784static PyObject *
11785unicode_zfill(PyUnicodeObject *self, PyObject *args)
11786{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011787 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011788 PyUnicodeObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011789 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011790 int kind;
11791 void *data;
11792 Py_UCS4 chr;
11793
11794 if (PyUnicode_READY(self) == -1)
11795 return NULL;
11796
Martin v. Löwis18e16552006-02-15 17:27:45 +000011797 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011798 return NULL;
11799
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011800 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000011801 if (PyUnicode_CheckExact(self)) {
11802 Py_INCREF(self);
11803 return (PyObject*) self;
11804 }
11805 else
Victor Stinner2219e0a2011-10-01 01:16:59 +020011806 return PyUnicode_Copy((PyObject*)self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011807 }
11808
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011809 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011810
11811 u = pad(self, fill, 0, '0');
11812
Walter Dörwald068325e2002-04-15 13:36:47 +000011813 if (u == NULL)
11814 return NULL;
11815
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011816 kind = PyUnicode_KIND(u);
11817 data = PyUnicode_DATA(u);
11818 chr = PyUnicode_READ(kind, data, fill);
11819
11820 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011821 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011822 PyUnicode_WRITE(kind, data, 0, chr);
11823 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000011824 }
11825
11826 return (PyObject*) u;
11827}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011828
#if 0
/* Compiled out: thin wrapper around
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
11836
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011837PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011838 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011839\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011840Return True if S starts with the specified prefix, False otherwise.\n\
11841With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011842With optional end, stop comparing S at that position.\n\
11843prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011844
11845static PyObject *
11846unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011847 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011848{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011849 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011850 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011851 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011852 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011853 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011854
Jesus Ceaac451502011-04-20 17:09:23 +020011855 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011856 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011857 if (PyTuple_Check(subobj)) {
11858 Py_ssize_t i;
11859 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11860 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011861 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011862 if (substring == NULL)
11863 return NULL;
11864 result = tailmatch(self, substring, start, end, -1);
11865 Py_DECREF(substring);
11866 if (result) {
11867 Py_RETURN_TRUE;
11868 }
11869 }
11870 /* nothing matched */
11871 Py_RETURN_FALSE;
11872 }
11873 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011874 if (substring == NULL) {
11875 if (PyErr_ExceptionMatches(PyExc_TypeError))
11876 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
11877 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011878 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011879 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011880 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011881 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011882 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011883}
11884
11885
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011886PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011887 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011888\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011889Return True if S ends with the specified suffix, False otherwise.\n\
11890With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011891With optional end, stop comparing S at that position.\n\
11892suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011893
11894static PyObject *
11895unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011896 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011897{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011898 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011899 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011900 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011901 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011902 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011903
Jesus Ceaac451502011-04-20 17:09:23 +020011904 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011905 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011906 if (PyTuple_Check(subobj)) {
11907 Py_ssize_t i;
11908 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11909 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011910 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011911 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011912 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011913 result = tailmatch(self, substring, start, end, +1);
11914 Py_DECREF(substring);
11915 if (result) {
11916 Py_RETURN_TRUE;
11917 }
11918 }
11919 Py_RETURN_FALSE;
11920 }
11921 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011922 if (substring == NULL) {
11923 if (PyErr_ExceptionMatches(PyExc_TypeError))
11924 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
11925 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011926 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011927 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011928 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011929 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011930 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011931}
11932
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011933#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000011934
11935PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011936 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000011937\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011938Return a formatted version of S, using substitutions from args and kwargs.\n\
11939The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000011940
Eric Smith27bbca62010-11-04 17:06:58 +000011941PyDoc_STRVAR(format_map__doc__,
11942 "S.format_map(mapping) -> str\n\
11943\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011944Return a formatted version of S, using substitutions from mapping.\n\
11945The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000011946
Eric Smith4a7d76d2008-05-30 18:10:19 +000011947static PyObject *
11948unicode__format__(PyObject* self, PyObject* args)
11949{
11950 PyObject *format_spec;
11951
11952 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
11953 return NULL;
11954
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011955 return _PyUnicode_FormatAdvanced(self, format_spec, 0,
11956 PyUnicode_GET_LENGTH(format_spec));
Eric Smith4a7d76d2008-05-30 18:10:19 +000011957}
11958
Eric Smith8c663262007-08-25 02:26:07 +000011959PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011960 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000011961\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011962Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000011963
11964static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011965unicode__sizeof__(PyUnicodeObject *v)
11966{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011967 Py_ssize_t size;
11968
11969 /* If it's a compact object, account for base structure +
11970 character data. */
11971 if (PyUnicode_IS_COMPACT_ASCII(v))
11972 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
11973 else if (PyUnicode_IS_COMPACT(v))
11974 size = sizeof(PyCompactUnicodeObject) +
11975 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v);
11976 else {
11977 /* If it is a two-block object, account for base object, and
11978 for character block if present. */
11979 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020011980 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011981 size += (PyUnicode_GET_LENGTH(v) + 1) *
11982 PyUnicode_CHARACTER_SIZE(v);
11983 }
11984 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020011985 with the data pointer. Check if the data is not shared. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011986 if (_PyUnicode_WSTR(v) &&
Victor Stinnera3be6132011-10-03 02:16:37 +020011987 (PyUnicode_DATA(v) != _PyUnicode_WSTR(v)))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011988 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020011989 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020011990 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011991
11992 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011993}
11994
11995PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011996 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011997
11998static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020011999unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012000{
Victor Stinner034f6cf2011-09-30 02:26:44 +020012001 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012002 if (!copy)
12003 return NULL;
12004 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012005}
12006
Guido van Rossumd57fd912000-03-10 22:53:23 +000012007static PyMethodDef unicode_methods[] = {
12008
12009 /* Order is according to common usage: often used methods should
12010 appear first, since lookup is done sequentially. */
12011
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012012 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012013 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12014 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012015 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012016 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12017 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12018 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12019 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12020 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12021 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12022 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012023 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012024 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12025 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12026 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012027 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012028 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12029 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12030 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012031 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012032 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012033 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012034 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012035 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12036 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12037 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12038 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12039 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12040 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12041 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12042 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12043 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12044 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12045 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12046 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12047 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12048 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012049 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012050 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012051 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012052 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012053 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012054 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012055 {"maketrans", (PyCFunction) unicode_maketrans,
12056 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012057 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012058#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012059 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012060#endif
12061
12062#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012063 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012064 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012065#endif
12066
Benjamin Peterson14339b62009-01-31 16:36:08 +000012067 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012068 {NULL, NULL}
12069};
12070
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012071static PyObject *
12072unicode_mod(PyObject *v, PyObject *w)
12073{
Brian Curtindfc80e32011-08-10 20:28:54 -050012074 if (!PyUnicode_Check(v))
12075 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012076 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012077}
12078
12079static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012080 0, /*nb_add*/
12081 0, /*nb_subtract*/
12082 0, /*nb_multiply*/
12083 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012084};
12085
Guido van Rossumd57fd912000-03-10 22:53:23 +000012086static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012087 (lenfunc) unicode_length, /* sq_length */
12088 PyUnicode_Concat, /* sq_concat */
12089 (ssizeargfunc) unicode_repeat, /* sq_repeat */
12090 (ssizeargfunc) unicode_getitem, /* sq_item */
12091 0, /* sq_slice */
12092 0, /* sq_ass_item */
12093 0, /* sq_ass_slice */
12094 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012095};
12096
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012097static PyObject*
12098unicode_subscript(PyUnicodeObject* self, PyObject* item)
12099{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012100 if (PyUnicode_READY(self) == -1)
12101 return NULL;
12102
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000012103 if (PyIndex_Check(item)) {
12104 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012105 if (i == -1 && PyErr_Occurred())
12106 return NULL;
12107 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012108 i += PyUnicode_GET_LENGTH(self);
Victor Stinner2fe5ced2011-10-02 00:25:40 +020012109 return unicode_getitem((PyObject*)self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012110 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000012111 Py_ssize_t start, stop, step, slicelength, cur, i;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012112 const Py_UNICODE* source_buf;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012113 Py_UNICODE* result_buf;
12114 PyObject* result;
12115
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012116 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000012117 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012118 return NULL;
12119 }
12120
12121 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012122 return PyUnicode_New(0, 0);
12123 } else if (start == 0 && step == 1 &&
12124 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000012125 PyUnicode_CheckExact(self)) {
12126 Py_INCREF(self);
12127 return (PyObject *)self;
12128 } else if (step == 1) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012129 return PyUnicode_Substring((PyObject*)self,
12130 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012131 } else {
12132 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +000012133 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
12134 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012135
Benjamin Peterson29060642009-01-31 22:14:21 +000012136 if (result_buf == NULL)
12137 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012138
12139 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
12140 result_buf[i] = source_buf[cur];
12141 }
Tim Petersced69f82003-09-16 20:30:58 +000012142
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012143 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +000012144 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012145 return result;
12146 }
12147 } else {
12148 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
12149 return NULL;
12150 }
12151}
12152
12153static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012154 (lenfunc)unicode_length, /* mp_length */
12155 (binaryfunc)unicode_subscript, /* mp_subscript */
12156 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012157};
12158
Guido van Rossumd57fd912000-03-10 22:53:23 +000012159
Guido van Rossumd57fd912000-03-10 22:53:23 +000012160/* Helpers for PyUnicode_Format() */
12161
12162static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000012163getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012164{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012165 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012166 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012167 (*p_argidx)++;
12168 if (arglen < 0)
12169 return args;
12170 else
12171 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012172 }
12173 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012174 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012175 return NULL;
12176}
12177
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012178/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012179
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012180static PyObject *
12181formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012182{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012183 char *p;
12184 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012185 double x;
Tim Petersced69f82003-09-16 20:30:58 +000012186
Guido van Rossumd57fd912000-03-10 22:53:23 +000012187 x = PyFloat_AsDouble(v);
12188 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012189 return NULL;
12190
Guido van Rossumd57fd912000-03-10 22:53:23 +000012191 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012192 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000012193
Eric Smith0923d1d2009-04-16 20:16:10 +000012194 p = PyOS_double_to_string(x, type, prec,
12195 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012196 if (p == NULL)
12197 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012198 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000012199 PyMem_Free(p);
12200 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012201}
12202
Tim Peters38fd5b62000-09-21 05:43:11 +000012203static PyObject*
12204formatlong(PyObject *val, int flags, int prec, int type)
12205{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012206 char *buf;
12207 int len;
12208 PyObject *str; /* temporary string object. */
12209 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012210
Benjamin Peterson14339b62009-01-31 16:36:08 +000012211 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
12212 if (!str)
12213 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012214 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012215 Py_DECREF(str);
12216 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012217}
12218
Guido van Rossumd57fd912000-03-10 22:53:23 +000012219static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012220formatchar(Py_UCS4 *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012221 size_t buflen,
12222 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012223{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012224 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012225 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012226 if (PyUnicode_GET_LENGTH(v) == 1) {
12227 buf[0] = PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000012228 buf[1] = '\0';
12229 return 1;
12230 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012231 goto onError;
12232 }
12233 else {
12234 /* Integer input truncated to a character */
12235 long x;
12236 x = PyLong_AsLong(v);
12237 if (x == -1 && PyErr_Occurred())
12238 goto onError;
12239
12240 if (x < 0 || x > 0x10ffff) {
12241 PyErr_SetString(PyExc_OverflowError,
12242 "%c arg not in range(0x110000)");
12243 return -1;
12244 }
12245
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012246 buf[0] = (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012247 buf[1] = '\0';
12248 return 1;
12249 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012250
Benjamin Peterson29060642009-01-31 22:14:21 +000012251 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012252 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012253 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012254 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012255}
12256
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012257/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012258 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012259*/
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012260#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012261
Alexander Belopolsky40018472011-02-26 01:02:56 +000012262PyObject *
12263PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012264{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012265 void *fmt;
12266 int fmtkind;
12267 PyObject *result;
12268 Py_UCS4 *res, *res0;
12269 Py_UCS4 max;
12270 int kind;
12271 Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012272 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012273 PyObject *dict = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012274 PyUnicodeObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +000012275
Guido van Rossumd57fd912000-03-10 22:53:23 +000012276 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012277 PyErr_BadInternalCall();
12278 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012279 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012280 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
12281 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012282 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012283 fmt = PyUnicode_DATA(uformat);
12284 fmtkind = PyUnicode_KIND(uformat);
12285 fmtcnt = PyUnicode_GET_LENGTH(uformat);
12286 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012287
12288 reslen = rescnt = fmtcnt + 100;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012289 res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4));
12290 if (res0 == NULL) {
12291 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012292 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012293 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012294
12295 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012296 arglen = PyTuple_Size(args);
12297 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012298 }
12299 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012300 arglen = -1;
12301 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012302 }
Christian Heimes90aa7642007-12-19 02:45:37 +000012303 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000012304 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000012305 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012306
12307 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012308 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012309 if (--rescnt < 0) {
12310 rescnt = fmtcnt + 100;
12311 reslen += rescnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012312 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12313 if (res0 == NULL){
12314 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012315 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012316 }
12317 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012318 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012319 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012320 *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012321 }
12322 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012323 /* Got a format specifier */
12324 int flags = 0;
12325 Py_ssize_t width = -1;
12326 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012327 Py_UCS4 c = '\0';
12328 Py_UCS4 fill;
Benjamin Peterson29060642009-01-31 22:14:21 +000012329 int isnumok;
12330 PyObject *v = NULL;
12331 PyObject *temp = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012332 void *pbuf;
12333 Py_ssize_t pindex;
Benjamin Peterson29060642009-01-31 22:14:21 +000012334 Py_UNICODE sign;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012335 Py_ssize_t len, len1;
12336 Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012337
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012338 fmtpos++;
12339 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
12340 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000012341 Py_ssize_t keylen;
12342 PyObject *key;
12343 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000012344
Benjamin Peterson29060642009-01-31 22:14:21 +000012345 if (dict == NULL) {
12346 PyErr_SetString(PyExc_TypeError,
12347 "format requires a mapping");
12348 goto onError;
12349 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012350 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012351 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012352 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012353 /* Skip over balanced parentheses */
12354 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012355 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000012356 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012357 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000012358 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012359 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000012360 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012361 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012362 if (fmtcnt < 0 || pcount > 0) {
12363 PyErr_SetString(PyExc_ValueError,
12364 "incomplete format key");
12365 goto onError;
12366 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020012367 key = PyUnicode_Substring((PyObject*)uformat,
12368 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000012369 if (key == NULL)
12370 goto onError;
12371 if (args_owned) {
12372 Py_DECREF(args);
12373 args_owned = 0;
12374 }
12375 args = PyObject_GetItem(dict, key);
12376 Py_DECREF(key);
12377 if (args == NULL) {
12378 goto onError;
12379 }
12380 args_owned = 1;
12381 arglen = -1;
12382 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012383 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012384 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012385 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012386 case '-': flags |= F_LJUST; continue;
12387 case '+': flags |= F_SIGN; continue;
12388 case ' ': flags |= F_BLANK; continue;
12389 case '#': flags |= F_ALT; continue;
12390 case '0': flags |= F_ZERO; continue;
12391 }
12392 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012393 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012394 if (c == '*') {
12395 v = getnextarg(args, arglen, &argidx);
12396 if (v == NULL)
12397 goto onError;
12398 if (!PyLong_Check(v)) {
12399 PyErr_SetString(PyExc_TypeError,
12400 "* wants int");
12401 goto onError;
12402 }
12403 width = PyLong_AsLong(v);
12404 if (width == -1 && PyErr_Occurred())
12405 goto onError;
12406 if (width < 0) {
12407 flags |= F_LJUST;
12408 width = -width;
12409 }
12410 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012411 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012412 }
12413 else if (c >= '0' && c <= '9') {
12414 width = c - '0';
12415 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012416 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012417 if (c < '0' || c > '9')
12418 break;
12419 if ((width*10) / 10 != width) {
12420 PyErr_SetString(PyExc_ValueError,
12421 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000012422 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000012423 }
12424 width = width*10 + (c - '0');
12425 }
12426 }
12427 if (c == '.') {
12428 prec = 0;
12429 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012430 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012431 if (c == '*') {
12432 v = getnextarg(args, arglen, &argidx);
12433 if (v == NULL)
12434 goto onError;
12435 if (!PyLong_Check(v)) {
12436 PyErr_SetString(PyExc_TypeError,
12437 "* wants int");
12438 goto onError;
12439 }
12440 prec = PyLong_AsLong(v);
12441 if (prec == -1 && PyErr_Occurred())
12442 goto onError;
12443 if (prec < 0)
12444 prec = 0;
12445 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012446 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012447 }
12448 else if (c >= '0' && c <= '9') {
12449 prec = c - '0';
12450 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012451 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012452 if (c < '0' || c > '9')
12453 break;
12454 if ((prec*10) / 10 != prec) {
12455 PyErr_SetString(PyExc_ValueError,
12456 "prec too big");
12457 goto onError;
12458 }
12459 prec = prec*10 + (c - '0');
12460 }
12461 }
12462 } /* prec */
12463 if (fmtcnt >= 0) {
12464 if (c == 'h' || c == 'l' || c == 'L') {
12465 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012466 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012467 }
12468 }
12469 if (fmtcnt < 0) {
12470 PyErr_SetString(PyExc_ValueError,
12471 "incomplete format");
12472 goto onError;
12473 }
12474 if (c != '%') {
12475 v = getnextarg(args, arglen, &argidx);
12476 if (v == NULL)
12477 goto onError;
12478 }
12479 sign = 0;
12480 fill = ' ';
12481 switch (c) {
12482
12483 case '%':
12484 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012485 kind = PyUnicode_4BYTE_KIND;
Benjamin Peterson29060642009-01-31 22:14:21 +000012486 /* presume that buffer length is at least 1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012487 PyUnicode_WRITE(kind, pbuf, 0, '%');
Benjamin Peterson29060642009-01-31 22:14:21 +000012488 len = 1;
12489 break;
12490
12491 case 's':
12492 case 'r':
12493 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000012494 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012495 temp = v;
12496 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012497 }
12498 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012499 if (c == 's')
12500 temp = PyObject_Str(v);
12501 else if (c == 'r')
12502 temp = PyObject_Repr(v);
12503 else
12504 temp = PyObject_ASCII(v);
12505 if (temp == NULL)
12506 goto onError;
12507 if (PyUnicode_Check(temp))
12508 /* nothing to do */;
12509 else {
12510 Py_DECREF(temp);
12511 PyErr_SetString(PyExc_TypeError,
12512 "%s argument has non-string str()");
12513 goto onError;
12514 }
12515 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012516 if (PyUnicode_READY(temp) == -1) {
12517 Py_CLEAR(temp);
12518 goto onError;
12519 }
12520 pbuf = PyUnicode_DATA(temp);
12521 kind = PyUnicode_KIND(temp);
12522 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012523 if (prec >= 0 && len > prec)
12524 len = prec;
12525 break;
12526
12527 case 'i':
12528 case 'd':
12529 case 'u':
12530 case 'o':
12531 case 'x':
12532 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000012533 isnumok = 0;
12534 if (PyNumber_Check(v)) {
12535 PyObject *iobj=NULL;
12536
12537 if (PyLong_Check(v)) {
12538 iobj = v;
12539 Py_INCREF(iobj);
12540 }
12541 else {
12542 iobj = PyNumber_Long(v);
12543 }
12544 if (iobj!=NULL) {
12545 if (PyLong_Check(iobj)) {
12546 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070012547 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000012548 Py_DECREF(iobj);
12549 if (!temp)
12550 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012551 if (PyUnicode_READY(temp) == -1) {
12552 Py_CLEAR(temp);
12553 goto onError;
12554 }
12555 pbuf = PyUnicode_DATA(temp);
12556 kind = PyUnicode_KIND(temp);
12557 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012558 sign = 1;
12559 }
12560 else {
12561 Py_DECREF(iobj);
12562 }
12563 }
12564 }
12565 if (!isnumok) {
12566 PyErr_Format(PyExc_TypeError,
12567 "%%%c format: a number is required, "
12568 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
12569 goto onError;
12570 }
12571 if (flags & F_ZERO)
12572 fill = '0';
12573 break;
12574
12575 case 'e':
12576 case 'E':
12577 case 'f':
12578 case 'F':
12579 case 'g':
12580 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012581 temp = formatfloat(v, flags, prec, c);
12582 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000012583 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012584 if (PyUnicode_READY(temp) == -1) {
12585 Py_CLEAR(temp);
12586 goto onError;
12587 }
12588 pbuf = PyUnicode_DATA(temp);
12589 kind = PyUnicode_KIND(temp);
12590 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012591 sign = 1;
12592 if (flags & F_ZERO)
12593 fill = '0';
12594 break;
12595
12596 case 'c':
12597 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012598 kind = PyUnicode_4BYTE_KIND;
Victor Stinnerb9dcffb2011-09-29 00:39:24 +020012599 len = formatchar(pbuf, Py_ARRAY_LENGTH(formatbuf), v);
Benjamin Peterson29060642009-01-31 22:14:21 +000012600 if (len < 0)
12601 goto onError;
12602 break;
12603
12604 default:
12605 PyErr_Format(PyExc_ValueError,
12606 "unsupported format character '%c' (0x%x) "
12607 "at index %zd",
12608 (31<=c && c<=126) ? (char)c : '?',
12609 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012610 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000012611 goto onError;
12612 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012613 /* pbuf is initialized here. */
12614 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000012615 if (sign) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012616 if (PyUnicode_READ(kind, pbuf, pindex) == '-' ||
12617 PyUnicode_READ(kind, pbuf, pindex) == '+') {
12618 sign = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012619 len--;
12620 }
12621 else if (flags & F_SIGN)
12622 sign = '+';
12623 else if (flags & F_BLANK)
12624 sign = ' ';
12625 else
12626 sign = 0;
12627 }
12628 if (width < len)
12629 width = len;
12630 if (rescnt - (sign != 0) < width) {
12631 reslen -= rescnt;
12632 rescnt = width + fmtcnt + 100;
12633 reslen += rescnt;
12634 if (reslen < 0) {
12635 Py_XDECREF(temp);
12636 PyErr_NoMemory();
12637 goto onError;
12638 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012639 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12640 if (res0 == 0) {
12641 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012642 Py_XDECREF(temp);
12643 goto onError;
12644 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012645 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012646 }
12647 if (sign) {
12648 if (fill != ' ')
12649 *res++ = sign;
12650 rescnt--;
12651 if (width > len)
12652 width--;
12653 }
12654 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012655 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12656 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000012657 if (fill != ' ') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012658 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12659 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012660 }
12661 rescnt -= 2;
12662 width -= 2;
12663 if (width < 0)
12664 width = 0;
12665 len -= 2;
12666 }
12667 if (width > len && !(flags & F_LJUST)) {
12668 do {
12669 --rescnt;
12670 *res++ = fill;
12671 } while (--width > len);
12672 }
12673 if (fill == ' ') {
12674 if (sign)
12675 *res++ = sign;
12676 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012677 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12678 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
12679 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12680 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012681 }
12682 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012683 /* Copy all characters, preserving len */
12684 len1 = len;
12685 while (len1--) {
12686 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12687 rescnt--;
12688 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012689 while (--width >= len) {
12690 --rescnt;
12691 *res++ = ' ';
12692 }
12693 if (dict && (argidx < arglen) && c != '%') {
12694 PyErr_SetString(PyExc_TypeError,
12695 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +000012696 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012697 goto onError;
12698 }
12699 Py_XDECREF(temp);
12700 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012701 } /* until end */
12702 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012703 PyErr_SetString(PyExc_TypeError,
12704 "not all arguments converted during string formatting");
12705 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012706 }
12707
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012708
12709 for (max=0, res = res0; res < res0+reslen-rescnt; res++)
12710 if (*res > max)
12711 max = *res;
12712 result = PyUnicode_New(reslen - rescnt, max);
12713 if (!result)
Benjamin Peterson29060642009-01-31 22:14:21 +000012714 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012715 kind = PyUnicode_KIND(result);
12716 for (res = res0; res < res0+reslen-rescnt; res++)
12717 PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res);
12718 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012719 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012720 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012721 }
12722 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012723 return (PyObject *)result;
12724
Benjamin Peterson29060642009-01-31 22:14:21 +000012725 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012726 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012727 Py_DECREF(uformat);
12728 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012729 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012730 }
12731 return NULL;
12732}
12733
Jeremy Hylton938ace62002-07-17 16:30:39 +000012734static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000012735unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
12736
Tim Peters6d6c1a32001-08-02 04:15:00 +000012737static PyObject *
12738unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12739{
Benjamin Peterson29060642009-01-31 22:14:21 +000012740 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012741 static char *kwlist[] = {"object", "encoding", "errors", 0};
12742 char *encoding = NULL;
12743 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000012744
Benjamin Peterson14339b62009-01-31 16:36:08 +000012745 if (type != &PyUnicode_Type)
12746 return unicode_subtype_new(type, args, kwds);
12747 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000012748 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012749 return NULL;
12750 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012751 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012752 if (encoding == NULL && errors == NULL)
12753 return PyObject_Str(x);
12754 else
Benjamin Peterson29060642009-01-31 22:14:21 +000012755 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000012756}
12757
Guido van Rossume023fe02001-08-30 03:12:59 +000012758static PyObject *
12759unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12760{
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012761 PyUnicodeObject *unicode, *self;
12762 Py_ssize_t length, char_size;
12763 int share_wstr, share_utf8;
12764 unsigned int kind;
12765 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000012766
Benjamin Peterson14339b62009-01-31 16:36:08 +000012767 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012768
12769 unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
12770 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012771 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020012772 assert(_PyUnicode_CHECK(unicode));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012773 if (PyUnicode_READY(unicode))
12774 return NULL;
12775
12776 self = (PyUnicodeObject *) type->tp_alloc(type, 0);
12777 if (self == NULL) {
12778 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012779 return NULL;
12780 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012781 kind = PyUnicode_KIND(unicode);
12782 length = PyUnicode_GET_LENGTH(unicode);
12783
12784 _PyUnicode_LENGTH(self) = length;
12785 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
12786 _PyUnicode_STATE(self).interned = 0;
12787 _PyUnicode_STATE(self).kind = kind;
12788 _PyUnicode_STATE(self).compact = 0;
12789 _PyUnicode_STATE(self).ascii = 0;
12790 _PyUnicode_STATE(self).ready = 1;
12791 _PyUnicode_WSTR(self) = NULL;
12792 _PyUnicode_UTF8_LENGTH(self) = 0;
12793 _PyUnicode_UTF8(self) = NULL;
12794 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020012795 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012796
12797 share_utf8 = 0;
12798 share_wstr = 0;
12799 if (kind == PyUnicode_1BYTE_KIND) {
12800 char_size = 1;
12801 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
12802 share_utf8 = 1;
12803 }
12804 else if (kind == PyUnicode_2BYTE_KIND) {
12805 char_size = 2;
12806 if (sizeof(wchar_t) == 2)
12807 share_wstr = 1;
12808 }
12809 else {
12810 assert(kind == PyUnicode_4BYTE_KIND);
12811 char_size = 4;
12812 if (sizeof(wchar_t) == 4)
12813 share_wstr = 1;
12814 }
12815
12816 /* Ensure we won't overflow the length. */
12817 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
12818 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012819 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012820 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012821 data = PyObject_MALLOC((length + 1) * char_size);
12822 if (data == NULL) {
12823 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012824 goto onError;
12825 }
12826
Victor Stinnerc3c74152011-10-02 20:39:55 +020012827 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012828 if (share_utf8) {
12829 _PyUnicode_UTF8_LENGTH(self) = length;
12830 _PyUnicode_UTF8(self) = data;
12831 }
12832 if (share_wstr) {
12833 _PyUnicode_WSTR_LENGTH(self) = length;
12834 _PyUnicode_WSTR(self) = (wchar_t *)data;
12835 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012836
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012837 Py_MEMCPY(data, PyUnicode_DATA(unicode),
12838 PyUnicode_KIND_SIZE(kind, length + 1));
12839 Py_DECREF(unicode);
12840 return (PyObject *)self;
12841
12842onError:
12843 Py_DECREF(unicode);
12844 Py_DECREF(self);
12845 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000012846}
12847
/* Docstring of the str type; referenced as tp_doc by PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
             "str(string[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000012854
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012855static PyObject *unicode_iter(PyObject *seq);
12856
/* Type object for str.  Slots left at 0 are not provided by this table.
   Py_TPFLAGS_BASETYPE allows subclassing; construction goes through
   unicode_new. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",              /* tp_name */
    sizeof(PyUnicodeObject),        /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    unicode_repr,           /* tp_repr */
    &unicode_as_number,         /* tp_as_number */
    &unicode_as_sequence,       /* tp_as_sequence */
    &unicode_as_mapping,        /* tp_as_mapping */
    (hashfunc) unicode_hash,        /* tp_hash*/
    0,                  /* tp_call*/
    (reprfunc) unicode_str,     /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
    unicode_doc,            /* tp_doc */
    0,                  /* tp_traverse */
    0,                  /* tp_clear */
    PyUnicode_RichCompare,      /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    unicode_iter,           /* tp_iter */
    0,                  /* tp_iternext */
    unicode_methods,            /* tp_methods */
    0,                  /* tp_members */
    0,                  /* tp_getset */
    &PyBaseObject_Type,         /* tp_base */
    0,                  /* tp_dict */
    0,                  /* tp_descr_get */
    0,                  /* tp_descr_set */
    0,                  /* tp_dictoffset */
    0,                  /* tp_init */
    0,                  /* tp_alloc */
    unicode_new,            /* tp_new */
    PyObject_Del,           /* tp_free */
};
12900
12901/* Initialize the Unicode implementation */
12902
Thomas Wouters78890102000-07-22 19:25:51 +000012903void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012904{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012905 int i;
12906
Thomas Wouters477c8d52006-05-27 19:21:47 +000012907 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012908 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012909 0x000A, /* LINE FEED */
12910 0x000D, /* CARRIAGE RETURN */
12911 0x001C, /* FILE SEPARATOR */
12912 0x001D, /* GROUP SEPARATOR */
12913 0x001E, /* RECORD SEPARATOR */
12914 0x0085, /* NEXT LINE */
12915 0x2028, /* LINE SEPARATOR */
12916 0x2029, /* PARAGRAPH SEPARATOR */
12917 };
12918
Fred Drakee4315f52000-05-09 19:53:39 +000012919 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020012920 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012921 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012922 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012923
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012924 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000012925 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000012926 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012927 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012928
12929 /* initialize the linebreak bloom filter */
12930 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012931 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020012932 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012933
12934 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012935}
12936
12937/* Finalize the Unicode implementation */
12938
/* Always reports 0: this implementation frees nothing here. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
12944
Guido van Rossumd57fd912000-03-10 22:53:23 +000012945void
Thomas Wouters78890102000-07-22 19:25:51 +000012946_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012947{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012948 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012949
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000012950 Py_XDECREF(unicode_empty);
12951 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000012952
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012953 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012954 if (unicode_latin1[i]) {
12955 Py_DECREF(unicode_latin1[i]);
12956 unicode_latin1[i] = NULL;
12957 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012958 }
Christian Heimesa156e092008-02-16 07:38:31 +000012959 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012960}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000012961
Walter Dörwald16807132007-05-25 13:52:07 +000012962void
12963PyUnicode_InternInPlace(PyObject **p)
12964{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012965 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
12966 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020012967#ifdef Py_DEBUG
12968 assert(s != NULL);
12969 assert(_PyUnicode_CHECK(s));
12970#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000012971 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020012972 return;
12973#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000012974 /* If it's a subclass, we don't really know what putting
12975 it in the interned dict might do. */
12976 if (!PyUnicode_CheckExact(s))
12977 return;
12978 if (PyUnicode_CHECK_INTERNED(s))
12979 return;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012980 if (PyUnicode_READY(s) == -1) {
Victor Stinner4fae54c2011-10-03 02:01:52 +020012981 assert(0 && "PyUnicode_READY fail in PyUnicode_InternInPlace");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012982 return;
12983 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012984 if (interned == NULL) {
12985 interned = PyDict_New();
12986 if (interned == NULL) {
12987 PyErr_Clear(); /* Don't leave an exception */
12988 return;
12989 }
12990 }
12991 /* It might be that the GetItem call fails even
12992 though the key is present in the dictionary,
12993 namely when this happens during a stack overflow. */
12994 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000012995 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012996 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000012997
Benjamin Peterson29060642009-01-31 22:14:21 +000012998 if (t) {
12999 Py_INCREF(t);
13000 Py_DECREF(*p);
13001 *p = t;
13002 return;
13003 }
Walter Dörwald16807132007-05-25 13:52:07 +000013004
Benjamin Peterson14339b62009-01-31 16:36:08 +000013005 PyThreadState_GET()->recursion_critical = 1;
13006 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
13007 PyErr_Clear();
13008 PyThreadState_GET()->recursion_critical = 0;
13009 return;
13010 }
13011 PyThreadState_GET()->recursion_critical = 0;
13012 /* The two references in interned are not counted by refcnt.
13013 The deallocator will take care of this */
13014 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013015 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000013016}
13017
13018void
13019PyUnicode_InternImmortal(PyObject **p)
13020{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013021 PyUnicodeObject *u = (PyUnicodeObject *)*p;
13022
Benjamin Peterson14339b62009-01-31 16:36:08 +000013023 PyUnicode_InternInPlace(p);
13024 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013025 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013026 Py_INCREF(*p);
13027 }
Walter Dörwald16807132007-05-25 13:52:07 +000013028}
13029
13030PyObject *
13031PyUnicode_InternFromString(const char *cp)
13032{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013033 PyObject *s = PyUnicode_FromString(cp);
13034 if (s == NULL)
13035 return NULL;
13036 PyUnicode_InternInPlace(&s);
13037 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000013038}
13039
Alexander Belopolsky40018472011-02-26 01:02:56 +000013040void
13041_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000013042{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013043 PyObject *keys;
13044 PyUnicodeObject *s;
13045 Py_ssize_t i, n;
13046 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000013047
Benjamin Peterson14339b62009-01-31 16:36:08 +000013048 if (interned == NULL || !PyDict_Check(interned))
13049 return;
13050 keys = PyDict_Keys(interned);
13051 if (keys == NULL || !PyList_Check(keys)) {
13052 PyErr_Clear();
13053 return;
13054 }
Walter Dörwald16807132007-05-25 13:52:07 +000013055
Benjamin Peterson14339b62009-01-31 16:36:08 +000013056 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
13057 detector, interned unicode strings are not forcibly deallocated;
13058 rather, we give them their stolen references back, and then clear
13059 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000013060
Benjamin Peterson14339b62009-01-31 16:36:08 +000013061 n = PyList_GET_SIZE(keys);
13062 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000013063 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013064 for (i = 0; i < n; i++) {
13065 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013066 if (PyUnicode_READY(s) == -1)
13067 fprintf(stderr, "could not ready string\n");
13068 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013069 case SSTATE_NOT_INTERNED:
13070 /* XXX Shouldn't happen */
13071 break;
13072 case SSTATE_INTERNED_IMMORTAL:
13073 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013074 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013075 break;
13076 case SSTATE_INTERNED_MORTAL:
13077 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013078 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013079 break;
13080 default:
13081 Py_FatalError("Inconsistent interned string state.");
13082 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013083 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013084 }
13085 fprintf(stderr, "total size of all interned strings: "
13086 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
13087 "mortal/immortal\n", mortal_size, immortal_size);
13088 Py_DECREF(keys);
13089 PyDict_Clear(interned);
13090 Py_DECREF(interned);
13091 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000013092}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013093
13094
13095/********************* Unicode Iterator **************************/
13096
13097typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013098 PyObject_HEAD
13099 Py_ssize_t it_index;
13100 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013101} unicodeiterobject;
13102
13103static void
13104unicodeiter_dealloc(unicodeiterobject *it)
13105{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013106 _PyObject_GC_UNTRACK(it);
13107 Py_XDECREF(it->it_seq);
13108 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013109}
13110
13111static int
13112unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
13113{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013114 Py_VISIT(it->it_seq);
13115 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013116}
13117
13118static PyObject *
13119unicodeiter_next(unicodeiterobject *it)
13120{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013121 PyUnicodeObject *seq;
13122 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013123
Benjamin Peterson14339b62009-01-31 16:36:08 +000013124 assert(it != NULL);
13125 seq = it->it_seq;
13126 if (seq == NULL)
13127 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013128 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013129
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013130 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
13131 int kind = PyUnicode_KIND(seq);
13132 void *data = PyUnicode_DATA(seq);
13133 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
13134 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013135 if (item != NULL)
13136 ++it->it_index;
13137 return item;
13138 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013139
Benjamin Peterson14339b62009-01-31 16:36:08 +000013140 Py_DECREF(seq);
13141 it->it_seq = NULL;
13142 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013143}
13144
13145static PyObject *
13146unicodeiter_len(unicodeiterobject *it)
13147{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013148 Py_ssize_t len = 0;
13149 if (it->it_seq)
13150 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
13151 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013152}
13153
13154PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
13155
13156static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013157 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000013158 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000013159 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013160};
13161
13162PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013163 PyVarObject_HEAD_INIT(&PyType_Type, 0)
13164 "str_iterator", /* tp_name */
13165 sizeof(unicodeiterobject), /* tp_basicsize */
13166 0, /* tp_itemsize */
13167 /* methods */
13168 (destructor)unicodeiter_dealloc, /* tp_dealloc */
13169 0, /* tp_print */
13170 0, /* tp_getattr */
13171 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013172 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013173 0, /* tp_repr */
13174 0, /* tp_as_number */
13175 0, /* tp_as_sequence */
13176 0, /* tp_as_mapping */
13177 0, /* tp_hash */
13178 0, /* tp_call */
13179 0, /* tp_str */
13180 PyObject_GenericGetAttr, /* tp_getattro */
13181 0, /* tp_setattro */
13182 0, /* tp_as_buffer */
13183 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
13184 0, /* tp_doc */
13185 (traverseproc)unicodeiter_traverse, /* tp_traverse */
13186 0, /* tp_clear */
13187 0, /* tp_richcompare */
13188 0, /* tp_weaklistoffset */
13189 PyObject_SelfIter, /* tp_iter */
13190 (iternextfunc)unicodeiter_next, /* tp_iternext */
13191 unicodeiter_methods, /* tp_methods */
13192 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013193};
13194
13195static PyObject *
13196unicode_iter(PyObject *seq)
13197{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013198 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013199
Benjamin Peterson14339b62009-01-31 16:36:08 +000013200 if (!PyUnicode_Check(seq)) {
13201 PyErr_BadInternalCall();
13202 return NULL;
13203 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013204 if (PyUnicode_READY(seq) == -1)
13205 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013206 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
13207 if (it == NULL)
13208 return NULL;
13209 it->it_index = 0;
13210 Py_INCREF(seq);
13211 it->it_seq = (PyUnicodeObject *)seq;
13212 _PyObject_GC_TRACK(it);
13213 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013214}
13215
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013216#define UNIOP(x) Py_UNICODE_##x
13217#define UNIOP_t Py_UNICODE
13218#include "uniops.h"
13219#undef UNIOP
13220#undef UNIOP_t
13221#define UNIOP(x) Py_UCS4_##x
13222#define UNIOP_t Py_UCS4
13223#include "uniops.h"
13224#undef UNIOP
13225#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000013226
Victor Stinner71133ff2010-09-01 23:43:53 +000013227Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000013228PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000013229{
13230 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
13231 Py_UNICODE *copy;
13232 Py_ssize_t size;
13233
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013234 if (!PyUnicode_Check(unicode)) {
13235 PyErr_BadArgument();
13236 return NULL;
13237 }
Victor Stinner71133ff2010-09-01 23:43:53 +000013238 /* Ensure we won't overflow the size. */
13239 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
13240 PyErr_NoMemory();
13241 return NULL;
13242 }
13243 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
13244 size *= sizeof(Py_UNICODE);
13245 copy = PyMem_Malloc(size);
13246 if (copy == NULL) {
13247 PyErr_NoMemory();
13248 return NULL;
13249 }
13250 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
13251 return copy;
13252}
Martin v. Löwis5b222132007-06-10 09:51:05 +000013253
Georg Brandl66c221e2010-10-14 07:04:07 +000013254/* A _string module, to export formatter_parser and formatter_field_name_split
13255 to the string.Formatter class implemented in Python. */
13256
13257static PyMethodDef _string_methods[] = {
13258 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
13259 METH_O, PyDoc_STR("split the argument as a field name")},
13260 {"formatter_parser", (PyCFunction) formatter_parser,
13261 METH_O, PyDoc_STR("parse the argument as a format string")},
13262 {NULL, NULL}
13263};
13264
13265static struct PyModuleDef _string_module = {
13266 PyModuleDef_HEAD_INIT,
13267 "_string",
13268 PyDoc_STR("string helper module"),
13269 0,
13270 _string_methods,
13271 NULL,
13272 NULL,
13273 NULL,
13274 NULL
13275};
13276
13277PyMODINIT_FUNC
13278PyInit__string(void)
13279{
13280 return PyModule_Create(&_string_module);
13281}
13282
13283
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013284#ifdef __cplusplus
13285}
13286#endif