blob: 6e4ef3d413bfdd84dcf443654ca9bc64040fad10 [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
/* Limit for the Unicode object free list */

#define PyUnicode_MAXFREELIST       1024

/* Limit for the Unicode object free list stay alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a size less than this
   limit. This reduces malloc() overhead for small Unicode objects.

   At worst this will result in PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Endianness switches; defaults to little endian */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
79
/* --- Globals ------------------------------------------------------------

   The globals are initialized by the _PyUnicode_Init() API and should
   not be used before calling that API.

*/


#ifdef __cplusplus
extern "C" {
#endif

/* In debug builds every checked accessor runs the full consistency check;
   otherwise it only performs the type check. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif

/* Raw field accessors (leading underscore) do no checking.  The variants
   without the underscore assert that the object is a ready string and, for
   compact ASCII strings, use the character data stored right after the
   PyASCIIObject header instead of the separate utf8 fields. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
#define _PyUnicode_WSTR(op) (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op) (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) (((PyASCIIObject *)(op))->hash)
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
#define _PyUnicode_DATA_ANY(op) (((PyUnicodeObject*)(op))->data.any)

/* Local override of PyUnicode_READY: evaluates to 0 when the string is
   already ready, otherwise to the result of _PyUnicode_Ready(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 : _PyUnicode_Ready((PyObject *)(op))))

/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data) */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        const from_type *iter_; to_type *to_;           \
        for (iter_ = (begin), to_ = (to_type *)(to);    \
             iter_ < (end);                             \
             ++iter_, ++to_) {                          \
            *to_ = (to_type)*iter_;                     \
        }                                               \
    } while (0)

/* The Unicode string has been modified: reset the hash */
#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)
158
Walter Dörwald16807132007-05-25 13:52:07 +0000159/* This dictionary holds all interned unicode strings. Note that references
160 to strings in this dictionary are *not* counted in the string's ob_refcnt.
161 When the interned string reaches a refcnt of 0 the string deallocation
162 function will delete the reference from this dictionary.
163
164 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000165 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000166*/
167static PyObject *interned;
168
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000169/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200170static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000171
172/* Single character Unicode strings in the Latin-1 range are being
173 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200174static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000175
/* Fast detection of the most frequent whitespace characters.
   One entry per 7-bit code point (128 entries); 1 marks whitespace. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x0009: * CHARACTER TABULATION */
/*     case 0x000A: * LINE FEED */
/*     case 0x000B: * LINE TABULATION */
/*     case 0x000C: * FORM FEED */
/*     case 0x000D: * CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x001C: * FILE SEPARATOR */
/*     case 0x001D: * GROUP SEPARATOR */
/*     case 0x001E: * RECORD SEPARATOR */
/*     case 0x001F: * UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
/*     case 0x0020: * SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
206
Victor Stinnerfe226c02011-10-03 03:52:20 +0200207static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
208
Alexander Belopolsky40018472011-02-26 01:02:56 +0000209static PyObject *
210unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000211 PyObject **errorHandler,const char *encoding, const char *reason,
212 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
213 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
214
Alexander Belopolsky40018472011-02-26 01:02:56 +0000215static void
216raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300217 const char *encoding,
218 const Py_UNICODE *unicode, Py_ssize_t size,
219 Py_ssize_t startpos, Py_ssize_t endpos,
220 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000221
/* Same for linebreaks: one entry per 7-bit code point (128 entries);
   1 marks a line break character.  The table is only ever read, so it
   is declared const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
249
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300250/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
251 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000252Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000253PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000254{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000255#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000256 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000257#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000258 /* This is actually an illegal character, so it should
259 not be passed to unichr. */
260 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000261#endif
262}
263
#ifdef Py_DEBUG
/* Debug-only sanity check of the state flags of a unicode object.

   Every violated invariant trips an assert(); when all checks pass the
   function returns 1, so it can itself be used as an assert() argument
   (see _PyUnicode_CHECK). */
static int
_PyUnicode_CheckConsistency(void *op)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1) {
        /* ASCII strings are always compact, ready and 1 byte wide */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.compact == 1);
        assert(ascii->state.ready == 1);
    }
    else if (ascii->state.compact == 1) {
        /* compact, non-ASCII */
        assert(kind == PyUnicode_1BYTE_KIND
               || kind == PyUnicode_2BYTE_KIND
               || kind == PyUnicode_4BYTE_KIND);
        assert(ascii->state.ascii == 0);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        PyUnicodeObject *unicode = (PyUnicodeObject *)op;

        if (kind == PyUnicode_WCHAR_KIND) {
            /* legacy string that is not ready: only wstr is set */
            assert(ascii->state.compact == 0);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 0);
            assert(ascii->wstr != NULL);
            assert(unicode->data.any == NULL);
            assert(compact->utf8 == NULL);
            assert(ascii->state.interned == SSTATE_NOT_INTERNED);
        }
        else {
            /* legacy string that is ready: data buffer is set */
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.compact == 0);
            assert(ascii->state.ready == 1);
            assert(unicode->data.any != NULL);
            assert(ascii->state.ascii == 0);
        }
    }
    return 1;
}
#endif
314
Thomas Wouters477c8d52006-05-27 19:21:47 +0000315/* --- Bloom Filters ----------------------------------------------------- */
316
317/* stuff to implement simple "bloom filters" for Unicode characters.
318 to keep things simple, we use a single bitmask, using the least 5
319 bits from each unicode characters as the bit index. */
320
321/* the linebreak mask is set up by Unicode_Init below */
322
Antoine Pitrouf068f942010-01-13 14:19:12 +0000323#if LONG_BIT >= 128
324#define BLOOM_WIDTH 128
325#elif LONG_BIT >= 64
326#define BLOOM_WIDTH 64
327#elif LONG_BIT >= 32
328#define BLOOM_WIDTH 32
329#else
330#error "LONG_BIT is smaller than 32"
331#endif
332
Thomas Wouters477c8d52006-05-27 19:21:47 +0000333#define BLOOM_MASK unsigned long
334
335static BLOOM_MASK bloom_linebreak;
336
/* BLOOM_ADD sets, and BLOOM tests, the bit selected by the low-order bits
   of ch.  The mask argument is parenthesized so that an expression argument
   (e.g. "a | b") is combined as a whole rather than being split by operator
   precedence.  BLOOM_ADD needs a modifiable lvalue as mask. */
#define BLOOM_ADD(mask, ch) ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch)     ((mask) &  (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Characters below 128 are looked up directly in ascii_linebreak; for the
   rest the bloom mask is used as a cheap pre-filter before the exact
   Py_UNICODE_ISLINEBREAK() test.  Note that ch is evaluated several times. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000343
Alexander Belopolsky40018472011-02-26 01:02:56 +0000344Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200345make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000346{
347 /* calculate simple bloom-style bitmask for a given unicode string */
348
Antoine Pitrouf068f942010-01-13 14:19:12 +0000349 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000350 Py_ssize_t i;
351
352 mask = 0;
353 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200354 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000355
356 return mask;
357}
358
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200359#define BLOOM_MEMBER(mask, chr, str) \
360 (BLOOM(mask, chr) \
361 && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000362
Guido van Rossumd57fd912000-03-10 22:53:23 +0000363/* --- Unicode Object ----------------------------------------------------- */
364
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200365static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200366fixup(PyUnicodeObject *self, Py_UCS4 (*fixfct)(PyUnicodeObject *s));
367
368Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
369 Py_ssize_t size, Py_UCS4 ch,
370 int direction)
371{
372 /* like wcschr, but doesn't stop at NULL characters */
373 Py_ssize_t i;
374 if (direction == 1) {
375 for(i = 0; i < size; i++)
376 if (PyUnicode_READ(kind, s, i) == ch)
377 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
378 }
379 else {
380 for(i = size-1; i >= 0; i--)
381 if (PyUnicode_READ(kind, s, i) == ch)
382 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
383 }
384 return NULL;
385}
386
Victor Stinnerfe226c02011-10-03 03:52:20 +0200387static PyObject*
388resize_compact(PyObject *unicode, Py_ssize_t length)
389{
390 Py_ssize_t char_size;
391 Py_ssize_t struct_size;
392 Py_ssize_t new_size;
393 int share_wstr;
394
395 assert(PyUnicode_IS_READY(unicode));
396 char_size = PyUnicode_CHARACTER_SIZE(unicode);
397 if (PyUnicode_IS_COMPACT_ASCII(unicode))
398 struct_size = sizeof(PyASCIIObject);
399 else
400 struct_size = sizeof(PyCompactUnicodeObject);
401 share_wstr = (_PyUnicode_WSTR(unicode) == PyUnicode_DATA(unicode));
402
403 _Py_DEC_REFTOTAL;
404 _Py_ForgetReference(unicode);
405
406 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
407 PyErr_NoMemory();
408 return NULL;
409 }
410 new_size = (struct_size + (length + 1) * char_size);
411
412 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
413 if (unicode == NULL) {
414 PyObject_Del(unicode);
415 PyErr_NoMemory();
416 return NULL;
417 }
418 _Py_NewReference(unicode);
419 _PyUnicode_LENGTH(unicode) = length;
420 if (share_wstr)
421 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
422 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
423 length, 0);
424 return unicode;
425}
426
Alexander Belopolsky40018472011-02-26 01:02:56 +0000427static int
Victor Stinnerfe226c02011-10-03 03:52:20 +0200428resize_inplace(register PyUnicodeObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000429{
430 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000431
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200432 assert(!PyUnicode_IS_COMPACT(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200433
Victor Stinnerfe226c02011-10-03 03:52:20 +0200434 assert(Py_REFCNT(unicode) == 1);
Victor Stinnerb15d4d82011-09-28 23:59:20 +0200435 _PyUnicode_DIRTY(unicode);
Tim Petersced69f82003-09-16 20:30:58 +0000436
Victor Stinnerfe226c02011-10-03 03:52:20 +0200437 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
438 {
439 PyObject_DEL(_PyUnicode_UTF8(unicode));
440 _PyUnicode_UTF8(unicode) = NULL;
441 }
442
443 if (PyUnicode_IS_READY(unicode)) {
444 Py_ssize_t char_size;
445 Py_ssize_t new_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200446 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200447 void *data;
448
449 data = _PyUnicode_DATA_ANY(unicode);
450 assert(data != NULL);
451 char_size = PyUnicode_CHARACTER_SIZE(unicode);
452 share_wstr = (_PyUnicode_WSTR(unicode) == data);
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200453 share_utf8 = (_PyUnicode_UTF8(unicode) == data);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200454
455 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
456 PyErr_NoMemory();
457 return -1;
458 }
459 new_size = (length + 1) * char_size;
460
461 data = (PyObject *)PyObject_REALLOC(data, new_size);
462 if (data == NULL) {
463 PyErr_NoMemory();
464 return -1;
465 }
466 _PyUnicode_DATA_ANY(unicode) = data;
467 if (share_wstr)
468 _PyUnicode_WSTR(unicode) = data;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200469 if (share_utf8)
470 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200471 _PyUnicode_LENGTH(unicode) = length;
472 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
473 if (share_wstr)
474 return 0;
475 }
476 if (_PyUnicode_WSTR(unicode) != NULL) {
477 assert(_PyUnicode_WSTR(unicode) != NULL);
478
479 oldstr = _PyUnicode_WSTR(unicode);
480 _PyUnicode_WSTR(unicode) = PyObject_REALLOC(_PyUnicode_WSTR(unicode),
481 sizeof(Py_UNICODE) * (length + 1));
482 if (!_PyUnicode_WSTR(unicode)) {
483 _PyUnicode_WSTR(unicode) = (Py_UNICODE *)oldstr;
484 PyErr_NoMemory();
485 return -1;
486 }
487 _PyUnicode_WSTR(unicode)[length] = 0;
488 _PyUnicode_WSTR_LENGTH(unicode) = length;
489 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000490 return 0;
491}
492
Victor Stinnerfe226c02011-10-03 03:52:20 +0200493static PyObject*
494resize_copy(PyObject *unicode, Py_ssize_t length)
495{
496 Py_ssize_t copy_length;
497 if (PyUnicode_IS_COMPACT(unicode)) {
498 PyObject *copy;
499 assert(PyUnicode_IS_READY(unicode));
500
501 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
502 if (copy == NULL)
503 return NULL;
504
505 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
506 if (PyUnicode_CopyCharacters(copy, 0,
507 unicode, 0,
508 copy_length) < 0)
509 {
510 Py_DECREF(copy);
511 return NULL;
512 }
513 return copy;
514 } else {
Victor Stinner2fd82272011-10-03 04:06:05 +0200515 PyUnicodeObject *w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200516 assert(_PyUnicode_WSTR(unicode) != NULL);
517 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinner2fd82272011-10-03 04:06:05 +0200518 w = _PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200519 if (w == NULL)
520 return NULL;
521 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
522 copy_length = Py_MIN(copy_length, length);
523 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
524 copy_length);
525 return (PyObject*)w;
526 }
527}
528
Guido van Rossumd57fd912000-03-10 22:53:23 +0000529/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000530 Ux0000 terminated; some code (e.g. new_identifier)
531 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000532
533 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000534 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000535
536*/
537
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200538#ifdef Py_DEBUG
539int unicode_old_new_calls = 0;
540#endif
541
Alexander Belopolsky40018472011-02-26 01:02:56 +0000542static PyUnicodeObject *
543_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000544{
545 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200546 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000547
Thomas Wouters477c8d52006-05-27 19:21:47 +0000548 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000549 if (length == 0 && unicode_empty != NULL) {
550 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200551 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000552 }
553
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000554 /* Ensure we won't overflow the size. */
555 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
556 return (PyUnicodeObject *)PyErr_NoMemory();
557 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200558 if (length < 0) {
559 PyErr_SetString(PyExc_SystemError,
560 "Negative size passed to _PyUnicode_New");
561 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000562 }
563
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200564#ifdef Py_DEBUG
565 ++unicode_old_new_calls;
566#endif
567
568 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
569 if (unicode == NULL)
570 return NULL;
571 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
572 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
573 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000574 PyErr_NoMemory();
575 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000576 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200577
Jeremy Hyltond8082792003-09-16 19:41:39 +0000578 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000579 * the caller fails before initializing str -- unicode_resize()
580 * reads str[0], and the Keep-Alive optimization can keep memory
581 * allocated for str alive across a call to unicode_dealloc(unicode).
582 * We don't want unicode_resize to read uninitialized memory in
583 * that case.
584 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200585 _PyUnicode_WSTR(unicode)[0] = 0;
586 _PyUnicode_WSTR(unicode)[length] = 0;
587 _PyUnicode_WSTR_LENGTH(unicode) = length;
588 _PyUnicode_HASH(unicode) = -1;
589 _PyUnicode_STATE(unicode).interned = 0;
590 _PyUnicode_STATE(unicode).kind = 0;
591 _PyUnicode_STATE(unicode).compact = 0;
592 _PyUnicode_STATE(unicode).ready = 0;
593 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200594 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200595 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200596 _PyUnicode_UTF8(unicode) = NULL;
597 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000598 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000599
Benjamin Peterson29060642009-01-31 22:14:21 +0000600 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000601 /* XXX UNREF/NEWREF interface should be more symmetrical */
602 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000603 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000604 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000605 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000606}
607
Victor Stinnerf42dc442011-10-02 23:33:16 +0200608static const char*
609unicode_kind_name(PyObject *unicode)
610{
Victor Stinner910337b2011-10-03 03:20:16 +0200611 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerf42dc442011-10-02 23:33:16 +0200612 if (!PyUnicode_IS_COMPACT(unicode))
613 {
614 if (!PyUnicode_IS_READY(unicode))
615 return "wstr";
616 switch(PyUnicode_KIND(unicode))
617 {
618 case PyUnicode_1BYTE_KIND:
619 if (PyUnicode_IS_COMPACT_ASCII(unicode))
620 return "legacy ascii";
621 else
622 return "legacy latin1";
623 case PyUnicode_2BYTE_KIND:
624 return "legacy UCS2";
625 case PyUnicode_4BYTE_KIND:
626 return "legacy UCS4";
627 default:
628 return "<legacy invalid kind>";
629 }
630 }
631 assert(PyUnicode_IS_READY(unicode));
632 switch(PyUnicode_KIND(unicode))
633 {
634 case PyUnicode_1BYTE_KIND:
635 if (PyUnicode_IS_COMPACT_ASCII(unicode))
636 return "ascii";
637 else
638 return "compact latin1";
639 case PyUnicode_2BYTE_KIND:
640 return "compact UCS2";
641 case PyUnicode_4BYTE_KIND:
642 return "compact UCS4";
643 default:
644 return "<invalid compact kind>";
645 }
646}
647
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200648#ifdef Py_DEBUG
649int unicode_new_new_calls = 0;
650
651/* Functions wrapping macros for use in debugger */
652char *_PyUnicode_utf8(void *unicode){
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200653 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200654}
655
656void *_PyUnicode_compact_data(void *unicode) {
657 return _PyUnicode_COMPACT_DATA(unicode);
658}
659void *_PyUnicode_data(void *unicode){
660 printf("obj %p\n", unicode);
661 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
662 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
663 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
664 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
665 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
666 return PyUnicode_DATA(unicode);
667}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200668
669void
670_PyUnicode_Dump(PyObject *op)
671{
672 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200673 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
674 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
675 void *data;
676 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
677 if (ascii->state.compact)
678 data = (compact + 1);
679 else
680 data = unicode->data.any;
681 if (ascii->wstr == data)
682 printf("shared ");
683 printf("wstr=%p", ascii->wstr);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200684 if (!ascii->state.ascii) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200685 printf(" (%zu), ", compact->wstr_length);
686 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
687 printf("shared ");
688 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200689 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200690 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200691}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200692#endif
693
694PyObject *
695PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
696{
697 PyObject *obj;
698 PyCompactUnicodeObject *unicode;
699 void *data;
700 int kind_state;
701 int is_sharing = 0, is_ascii = 0;
702 Py_ssize_t char_size;
703 Py_ssize_t struct_size;
704
705 /* Optimization for empty strings */
706 if (size == 0 && unicode_empty != NULL) {
707 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200708 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200709 }
710
711#ifdef Py_DEBUG
712 ++unicode_new_new_calls;
713#endif
714
715 struct_size = sizeof(PyCompactUnicodeObject);
716 if (maxchar < 128) {
717 kind_state = PyUnicode_1BYTE_KIND;
718 char_size = 1;
719 is_ascii = 1;
720 struct_size = sizeof(PyASCIIObject);
721 }
722 else if (maxchar < 256) {
723 kind_state = PyUnicode_1BYTE_KIND;
724 char_size = 1;
725 }
726 else if (maxchar < 65536) {
727 kind_state = PyUnicode_2BYTE_KIND;
728 char_size = 2;
729 if (sizeof(wchar_t) == 2)
730 is_sharing = 1;
731 }
732 else {
733 kind_state = PyUnicode_4BYTE_KIND;
734 char_size = 4;
735 if (sizeof(wchar_t) == 4)
736 is_sharing = 1;
737 }
738
739 /* Ensure we won't overflow the size. */
740 if (size < 0) {
741 PyErr_SetString(PyExc_SystemError,
742 "Negative size passed to PyUnicode_New");
743 return NULL;
744 }
745 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
746 return PyErr_NoMemory();
747
748 /* Duplicated allocation code from _PyObject_New() instead of a call to
749 * PyObject_New() so we are able to allocate space for the object and
750 * it's data buffer.
751 */
752 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
753 if (obj == NULL)
754 return PyErr_NoMemory();
755 obj = PyObject_INIT(obj, &PyUnicode_Type);
756 if (obj == NULL)
757 return NULL;
758
759 unicode = (PyCompactUnicodeObject *)obj;
760 if (is_ascii)
761 data = ((PyASCIIObject*)obj) + 1;
762 else
763 data = unicode + 1;
764 _PyUnicode_LENGTH(unicode) = size;
765 _PyUnicode_HASH(unicode) = -1;
766 _PyUnicode_STATE(unicode).interned = 0;
767 _PyUnicode_STATE(unicode).kind = kind_state;
768 _PyUnicode_STATE(unicode).compact = 1;
769 _PyUnicode_STATE(unicode).ready = 1;
770 _PyUnicode_STATE(unicode).ascii = is_ascii;
771 if (is_ascii) {
772 ((char*)data)[size] = 0;
773 _PyUnicode_WSTR(unicode) = NULL;
774 }
775 else if (kind_state == PyUnicode_1BYTE_KIND) {
776 ((char*)data)[size] = 0;
777 _PyUnicode_WSTR(unicode) = NULL;
778 _PyUnicode_WSTR_LENGTH(unicode) = 0;
779 unicode->utf8_length = 0;
780 unicode->utf8 = NULL;
781 }
782 else {
783 unicode->utf8 = NULL;
784 if (kind_state == PyUnicode_2BYTE_KIND)
785 ((Py_UCS2*)data)[size] = 0;
786 else /* kind_state == PyUnicode_4BYTE_KIND */
787 ((Py_UCS4*)data)[size] = 0;
788 if (is_sharing) {
789 _PyUnicode_WSTR_LENGTH(unicode) = size;
790 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
791 }
792 else {
793 _PyUnicode_WSTR_LENGTH(unicode) = 0;
794 _PyUnicode_WSTR(unicode) = NULL;
795 }
796 }
797 return obj;
798}
799
#if SIZEOF_WCHAR_T == 2
/* Helper to convert a 16-bit wchar_t buffer to UCS4.  A high surrogate
   immediately followed by a low surrogate is combined into one code point;
   every other unit (lone surrogates included) is copied unchanged.  The
   other conversions are implemented as macros for efficiency.

   The destination must already be a 4-byte-kind string whose length equals
   the number of code points that [begin, end) decodes to.  This function
   assumes that unicode can hold one more code point than that for a
   terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyUnicodeObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        int is_pair;

        /* never write past the declared length of the destination */
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));

        is_pair = (src[0] >= 0xD800 && src[0] <= 0xDBFF
                   && (src + 1) < end
                   && src[1] >= 0xDC00 && src[1] <= 0xDFFF);
        if (is_pair) {
            *dst++ = (((src[0] & 0x3FF)<<10) | (src[1] & 0x3FF)) + 0x10000;
            src += 2;
        }
        else {
            *dst++ = *src++;
        }
    }
    /* the destination must have been filled exactly */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
838
Victor Stinnercd9950f2011-10-02 00:34:53 +0200839static int
840_PyUnicode_Dirty(PyObject *unicode)
841{
Victor Stinner910337b2011-10-03 03:20:16 +0200842 assert(_PyUnicode_CHECK(unicode));
Victor Stinnercd9950f2011-10-02 00:34:53 +0200843 if (Py_REFCNT(unicode) != 1) {
844 PyErr_SetString(PyExc_ValueError,
845 "Cannot modify a string having more than 1 reference");
846 return -1;
847 }
848 _PyUnicode_DIRTY(unicode);
849 return 0;
850}
851
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200852Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200853PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
854 PyObject *from, Py_ssize_t from_start,
855 Py_ssize_t how_many)
856{
Victor Stinnera0702ab2011-09-29 14:14:38 +0200857 unsigned int from_kind, to_kind;
858 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200859
Victor Stinnerb1536152011-09-30 02:26:10 +0200860 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
861 PyErr_BadInternalCall();
862 return -1;
863 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200864
865 if (PyUnicode_READY(from))
866 return -1;
867 if (PyUnicode_READY(to))
868 return -1;
869
Victor Stinnerff9e50f2011-09-28 22:17:19 +0200870 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200871 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
872 PyErr_Format(PyExc_ValueError,
873 "Cannot write %zi characters at %zi "
874 "in a string of %zi characters",
875 how_many, to_start, PyUnicode_GET_LENGTH(to));
876 return -1;
877 }
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200878 if (how_many == 0)
879 return 0;
880
Victor Stinnercd9950f2011-10-02 00:34:53 +0200881 if (_PyUnicode_Dirty(to))
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200882 return -1;
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200883
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200884 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200885 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200886 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200887 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200888
Victor Stinnerf42dc442011-10-02 23:33:16 +0200889 if (from_kind == to_kind
890 /* deny latin1 => ascii */
891 && PyUnicode_MAX_CHAR_VALUE(to) >= PyUnicode_MAX_CHAR_VALUE(from))
892 {
Victor Stinnera0702ab2011-09-29 14:14:38 +0200893 Py_MEMCPY((char*)to_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200894 + PyUnicode_KIND_SIZE(to_kind, to_start),
Victor Stinnera0702ab2011-09-29 14:14:38 +0200895 (char*)from_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200896 + PyUnicode_KIND_SIZE(from_kind, from_start),
897 PyUnicode_KIND_SIZE(to_kind, how_many));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200898 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200899 else if (from_kind == PyUnicode_1BYTE_KIND
900 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200901 {
902 _PyUnicode_CONVERT_BYTES(
903 Py_UCS1, Py_UCS2,
904 PyUnicode_1BYTE_DATA(from) + from_start,
905 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
906 PyUnicode_2BYTE_DATA(to) + to_start
907 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200908 }
Victor Stinner157f83f2011-09-28 21:41:31 +0200909 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200910 && to_kind == PyUnicode_4BYTE_KIND)
911 {
912 _PyUnicode_CONVERT_BYTES(
913 Py_UCS1, Py_UCS4,
914 PyUnicode_1BYTE_DATA(from) + from_start,
915 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
916 PyUnicode_4BYTE_DATA(to) + to_start
917 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200918 }
919 else if (from_kind == PyUnicode_2BYTE_KIND
920 && to_kind == PyUnicode_4BYTE_KIND)
921 {
922 _PyUnicode_CONVERT_BYTES(
923 Py_UCS2, Py_UCS4,
924 PyUnicode_2BYTE_DATA(from) + from_start,
925 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
926 PyUnicode_4BYTE_DATA(to) + to_start
927 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200928 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200929 else {
930 int invalid_kinds;
Victor Stinnerf42dc442011-10-02 23:33:16 +0200931
932 /* check if max_char(from substring) <= max_char(to) */
933 if (from_kind > to_kind
934 /* latin1 => ascii */
935 || (PyUnicode_IS_COMPACT_ASCII(to)
936 && to_kind == PyUnicode_1BYTE_KIND
937 && !PyUnicode_IS_COMPACT_ASCII(from)))
938 {
Victor Stinnera0702ab2011-09-29 14:14:38 +0200939 /* slow path to check for character overflow */
940 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
941 Py_UCS4 ch, maxchar;
942 Py_ssize_t i;
943
944 maxchar = 0;
945 invalid_kinds = 0;
946 for (i=0; i < how_many; i++) {
947 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
948 if (ch > maxchar) {
949 maxchar = ch;
950 if (maxchar > to_maxchar) {
951 invalid_kinds = 1;
952 break;
953 }
954 }
955 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
956 }
957 }
958 else
959 invalid_kinds = 1;
960 if (invalid_kinds) {
961 PyErr_Format(PyExc_ValueError,
Victor Stinnerf42dc442011-10-02 23:33:16 +0200962 "Cannot copy %s characters "
963 "into a string of %s characters",
964 unicode_kind_name(from),
965 unicode_kind_name(to));
Victor Stinnera0702ab2011-09-29 14:14:38 +0200966 return -1;
967 }
968 }
969 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200970}
971
Victor Stinner17222162011-09-28 22:15:37 +0200972/* Find the maximum code point and count the number of surrogate pairs so a
973 correct string length can be computed before converting a string to UCS4.
974 This function counts single surrogates as a character and not as a pair.
975
976 Return 0 on success, or -1 on error. */
977static int
978find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
979 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200980{
981 const wchar_t *iter;
982
Victor Stinnerc53be962011-10-02 21:33:54 +0200983 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200984 if (num_surrogates == NULL || maxchar == NULL) {
985 PyErr_SetString(PyExc_SystemError,
986 "unexpected NULL arguments to "
987 "PyUnicode_FindMaxCharAndNumSurrogatePairs");
988 return -1;
989 }
990
991 *num_surrogates = 0;
992 *maxchar = 0;
993
994 for (iter = begin; iter < end; ) {
995 if (*iter > *maxchar)
996 *maxchar = *iter;
997#if SIZEOF_WCHAR_T == 2
998 if (*iter >= 0xD800 && *iter <= 0xDBFF
999 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
1000 {
1001 Py_UCS4 surrogate_val;
1002 surrogate_val = (((iter[0] & 0x3FF)<<10)
1003 | (iter[1] & 0x3FF)) + 0x10000;
1004 ++(*num_surrogates);
1005 if (surrogate_val > *maxchar)
1006 *maxchar = surrogate_val;
1007 iter += 2;
1008 }
1009 else
1010 iter++;
1011#else
1012 iter++;
1013#endif
1014 }
1015 return 0;
1016}
1017
1018#ifdef Py_DEBUG
1019int unicode_ready_calls = 0;
1020#endif
1021
1022int
Victor Stinnerd8f65102011-09-29 19:43:17 +02001023_PyUnicode_Ready(PyObject *obj)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001024{
Victor Stinnerd8f65102011-09-29 19:43:17 +02001025 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001026 wchar_t *end;
1027 Py_UCS4 maxchar = 0;
1028 Py_ssize_t num_surrogates;
1029#if SIZEOF_WCHAR_T == 2
1030 Py_ssize_t length_wo_surrogates;
1031#endif
1032
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001033 /* _PyUnicode_Ready() is only intented for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001034 strings were created using _PyObject_New() and where no canonical
1035 representation (the str field) has been set yet aka strings
1036 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001037 assert(_PyUnicode_CHECK(unicode));
1038 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001039 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001040 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001041 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001042 /* Actually, it should neither be interned nor be anything else: */
1043 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001044
1045#ifdef Py_DEBUG
1046 ++unicode_ready_calls;
1047#endif
1048
1049 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001050 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001051 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001052 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001053
1054 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001055 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1056 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001057 PyErr_NoMemory();
1058 return -1;
1059 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001060 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001061 _PyUnicode_WSTR(unicode), end,
1062 PyUnicode_1BYTE_DATA(unicode));
1063 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1064 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1065 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1066 if (maxchar < 128) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001067 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001068 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001069 }
1070 else {
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001071 _PyUnicode_UTF8(unicode) = NULL;
1072 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001073 }
1074 PyObject_FREE(_PyUnicode_WSTR(unicode));
1075 _PyUnicode_WSTR(unicode) = NULL;
1076 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1077 }
1078 /* In this case we might have to convert down from 4-byte native
1079 wchar_t to 2-byte unicode. */
1080 else if (maxchar < 65536) {
1081 assert(num_surrogates == 0 &&
1082 "FindMaxCharAndNumSurrogatePairs() messed up");
1083
Victor Stinner506f5922011-09-28 22:34:18 +02001084#if SIZEOF_WCHAR_T == 2
1085 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001086 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001087 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1088 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1089 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001090 _PyUnicode_UTF8(unicode) = NULL;
1091 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001092#else
1093 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001094 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001095 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001096 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001097 PyErr_NoMemory();
1098 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001099 }
Victor Stinner506f5922011-09-28 22:34:18 +02001100 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1101 _PyUnicode_WSTR(unicode), end,
1102 PyUnicode_2BYTE_DATA(unicode));
1103 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1104 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1105 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001106 _PyUnicode_UTF8(unicode) = NULL;
1107 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001108 PyObject_FREE(_PyUnicode_WSTR(unicode));
1109 _PyUnicode_WSTR(unicode) = NULL;
1110 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1111#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001112 }
1113 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1114 else {
1115#if SIZEOF_WCHAR_T == 2
1116 /* in case the native representation is 2-bytes, we need to allocate a
1117 new normalized 4-byte version. */
1118 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001119 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1120 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001121 PyErr_NoMemory();
1122 return -1;
1123 }
1124 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1125 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001126 _PyUnicode_UTF8(unicode) = NULL;
1127 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001128 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1129 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001130 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001131 PyObject_FREE(_PyUnicode_WSTR(unicode));
1132 _PyUnicode_WSTR(unicode) = NULL;
1133 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1134#else
1135 assert(num_surrogates == 0);
1136
Victor Stinnerc3c74152011-10-02 20:39:55 +02001137 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001138 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001139 _PyUnicode_UTF8(unicode) = NULL;
1140 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001141 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1142#endif
1143 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1144 }
1145 _PyUnicode_STATE(unicode).ready = 1;
1146 return 0;
1147}
1148
Alexander Belopolsky40018472011-02-26 01:02:56 +00001149static void
1150unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001151{
Walter Dörwald16807132007-05-25 13:52:07 +00001152 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001153 case SSTATE_NOT_INTERNED:
1154 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001155
Benjamin Peterson29060642009-01-31 22:14:21 +00001156 case SSTATE_INTERNED_MORTAL:
1157 /* revive dead object temporarily for DelItem */
1158 Py_REFCNT(unicode) = 3;
1159 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
1160 Py_FatalError(
1161 "deletion of interned string failed");
1162 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001163
Benjamin Peterson29060642009-01-31 22:14:21 +00001164 case SSTATE_INTERNED_IMMORTAL:
1165 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001166
Benjamin Peterson29060642009-01-31 22:14:21 +00001167 default:
1168 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001169 }
1170
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001171 if (_PyUnicode_WSTR(unicode) &&
1172 (!PyUnicode_IS_READY(unicode) ||
1173 _PyUnicode_WSTR(unicode) != PyUnicode_DATA(unicode)))
1174 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001175 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001176 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001177
1178 if (PyUnicode_IS_COMPACT(unicode)) {
1179 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001180 }
1181 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001182 if (_PyUnicode_DATA_ANY(unicode))
1183 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Benjamin Peterson29060642009-01-31 22:14:21 +00001184 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001185 }
1186}
1187
Alexander Belopolsky40018472011-02-26 01:02:56 +00001188static int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001189unicode_resizable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001190{
Victor Stinnera3be6132011-10-03 02:16:37 +02001191 Py_ssize_t len;
Victor Stinnerca4f7a42011-10-03 04:18:04 +02001192#if SIZEOF_WCHAR_T == 2
1193 /* FIXME: unicode_resize() is buggy on Windows */
1194 return 0;
1195#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001196 if (Py_REFCNT(unicode) != 1)
1197 return 0;
1198 if (PyUnicode_CHECK_INTERNED(unicode))
1199 return 0;
1200 if (unicode == unicode_empty)
1201 return 0;
Victor Stinnera3be6132011-10-03 02:16:37 +02001202 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
1203 len = PyUnicode_WSTR_LENGTH(unicode);
1204 else
1205 len = PyUnicode_GET_LENGTH(unicode);
1206 if (len == 1) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001207 Py_UCS4 ch;
Victor Stinnera3be6132011-10-03 02:16:37 +02001208 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001209 ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnera3be6132011-10-03 02:16:37 +02001210 else
1211 ch = PyUnicode_READ_CHAR(unicode, 0);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001212 if (ch < 256 && unicode_latin1[ch] == unicode)
1213 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001214 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001215 /* FIXME: reenable resize_inplace */
1216 if (!PyUnicode_IS_COMPACT(unicode))
1217 return 0;
1218 return 1;
1219}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001220
Victor Stinnerfe226c02011-10-03 03:52:20 +02001221static int
1222unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1223{
1224 PyObject *unicode;
1225 Py_ssize_t old_length;
1226
1227 assert(p_unicode != NULL);
1228 unicode = *p_unicode;
1229
1230 assert(unicode != NULL);
1231 assert(PyUnicode_Check(unicode));
1232 assert(0 <= length);
1233
Victor Stinner910337b2011-10-03 03:20:16 +02001234 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001235 old_length = PyUnicode_WSTR_LENGTH(unicode);
1236 else
1237 old_length = PyUnicode_GET_LENGTH(unicode);
1238 if (old_length == length)
1239 return 0;
1240
1241 /* FIXME: really create a new object? */
1242 if (!unicode_resizable(unicode)) {
1243 PyObject *copy = resize_copy(unicode, length);
1244 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001245 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001246 Py_DECREF(*p_unicode);
1247 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001248 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001249 }
1250
Victor Stinnerfe226c02011-10-03 03:52:20 +02001251 if (PyUnicode_IS_COMPACT(unicode)) {
1252 *p_unicode = resize_compact(unicode, length);
1253 if (*p_unicode == NULL)
1254 return -1;
1255 return 0;
1256 } else
1257 return resize_inplace((PyUnicodeObject*)unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001258}
1259
Alexander Belopolsky40018472011-02-26 01:02:56 +00001260int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001261PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001262{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001263 PyObject *unicode;
1264 if (p_unicode == NULL) {
1265 PyErr_BadInternalCall();
1266 return -1;
1267 }
1268 unicode = *p_unicode;
1269 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0
1270 || _PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND)
1271 {
1272 PyErr_BadInternalCall();
1273 return -1;
1274 }
1275 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001276}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001277
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001278static PyObject*
1279get_latin1_char(unsigned char ch)
1280{
Victor Stinnera464fc12011-10-02 20:39:30 +02001281 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001282 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001283 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001284 if (!unicode)
1285 return NULL;
1286 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1287 unicode_latin1[ch] = unicode;
1288 }
1289 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001290 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001291}
1292
Alexander Belopolsky40018472011-02-26 01:02:56 +00001293PyObject *
1294PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001295{
1296 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001297 Py_UCS4 maxchar = 0;
1298 Py_ssize_t num_surrogates;
1299
1300 if (u == NULL)
1301 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001302
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001303 /* If the Unicode data is known at construction time, we can apply
1304 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001305
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001306 /* Optimization for empty strings */
1307 if (size == 0 && unicode_empty != NULL) {
1308 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001309 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001310 }
Tim Petersced69f82003-09-16 20:30:58 +00001311
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001312 /* Single character Unicode objects in the Latin-1 range are
1313 shared when using this constructor */
1314 if (size == 1 && *u < 256)
1315 return get_latin1_char((unsigned char)*u);
1316
1317 /* If not empty and not single character, copy the Unicode data
1318 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001319 if (find_maxchar_surrogates(u, u + size,
1320 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001321 return NULL;
1322
1323 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1324 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001325 if (!unicode)
1326 return NULL;
1327
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001328 switch (PyUnicode_KIND(unicode)) {
1329 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001330 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001331 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1332 break;
1333 case PyUnicode_2BYTE_KIND:
1334#if Py_UNICODE_SIZE == 2
1335 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1336#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001337 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001338 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1339#endif
1340 break;
1341 case PyUnicode_4BYTE_KIND:
1342#if SIZEOF_WCHAR_T == 2
1343 /* This is the only case which has to process surrogates, thus
1344 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001345 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001346#else
1347 assert(num_surrogates == 0);
1348 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1349#endif
1350 break;
1351 default:
1352 assert(0 && "Impossible state");
1353 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001354
1355 return (PyObject *)unicode;
1356}
1357
Alexander Belopolsky40018472011-02-26 01:02:56 +00001358PyObject *
1359PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001360{
1361 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001362
Benjamin Peterson14339b62009-01-31 16:36:08 +00001363 if (size < 0) {
1364 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001365 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001366 return NULL;
1367 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001368
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001369 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001370 some optimizations which share commonly used objects.
1371 Also, this means the input must be UTF-8, so fall back to the
1372 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001373 if (u != NULL) {
1374
Benjamin Peterson29060642009-01-31 22:14:21 +00001375 /* Optimization for empty strings */
1376 if (size == 0 && unicode_empty != NULL) {
1377 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001378 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001379 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001380
1381 /* Single characters are shared when using this constructor.
1382 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001383 if (size == 1 && Py_CHARMASK(*u) < 128)
1384 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001385
1386 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001387 }
1388
Walter Dörwald55507312007-05-18 13:12:10 +00001389 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001390 if (!unicode)
1391 return NULL;
1392
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001393 return (PyObject *)unicode;
1394}
1395
Alexander Belopolsky40018472011-02-26 01:02:56 +00001396PyObject *
1397PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001398{
1399 size_t size = strlen(u);
1400 if (size > PY_SSIZE_T_MAX) {
1401 PyErr_SetString(PyExc_OverflowError, "input too long");
1402 return NULL;
1403 }
1404
1405 return PyUnicode_FromStringAndSize(u, size);
1406}
1407
Victor Stinnere57b1c02011-09-28 22:20:48 +02001408static PyObject*
1409_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001410{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001411 PyObject *res;
1412 unsigned char max = 127;
1413 Py_ssize_t i;
1414 for (i = 0; i < size; i++) {
1415 if (u[i] & 0x80) {
1416 max = 255;
1417 break;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001418 }
1419 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001420 res = PyUnicode_New(size, max);
1421 if (!res)
1422 return NULL;
1423 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1424 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001425}
1426
Victor Stinnere57b1c02011-09-28 22:20:48 +02001427static PyObject*
1428_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001429{
1430 PyObject *res;
1431 Py_UCS2 max = 0;
1432 Py_ssize_t i;
1433 for (i = 0; i < size; i++)
1434 if (u[i] > max)
1435 max = u[i];
1436 res = PyUnicode_New(size, max);
1437 if (!res)
1438 return NULL;
1439 if (max >= 256)
1440 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1441 else
1442 for (i = 0; i < size; i++)
1443 PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i];
1444 return res;
1445}
1446
Victor Stinnere57b1c02011-09-28 22:20:48 +02001447static PyObject*
1448_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001449{
1450 PyObject *res;
1451 Py_UCS4 max = 0;
1452 Py_ssize_t i;
1453 for (i = 0; i < size; i++)
1454 if (u[i] > max)
1455 max = u[i];
1456 res = PyUnicode_New(size, max);
1457 if (!res)
1458 return NULL;
1459 if (max >= 0x10000)
1460 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1461 else {
1462 int kind = PyUnicode_KIND(res);
1463 void *data = PyUnicode_DATA(res);
1464 for (i = 0; i < size; i++)
1465 PyUnicode_WRITE(kind, data, i, u[i]);
1466 }
1467 return res;
1468}
1469
1470PyObject*
1471PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1472{
1473 switch(kind) {
1474 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001475 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001476 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001477 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001478 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001479 return _PyUnicode_FromUCS4(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001480 }
Victor Stinner202b62b2011-10-01 23:48:37 +02001481 PyErr_SetString(PyExc_ValueError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001482 return NULL;
1483}
1484
Victor Stinner034f6cf2011-09-30 02:26:44 +02001485PyObject*
1486PyUnicode_Copy(PyObject *unicode)
1487{
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001488 Py_ssize_t size;
1489 PyObject *copy;
1490 void *data;
1491
Victor Stinner034f6cf2011-09-30 02:26:44 +02001492 if (!PyUnicode_Check(unicode)) {
1493 PyErr_BadInternalCall();
1494 return NULL;
1495 }
1496 if (PyUnicode_READY(unicode))
1497 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001498
1499 size = PyUnicode_GET_LENGTH(unicode);
1500 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1501 if (!copy)
1502 return NULL;
1503 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1504
1505 data = PyUnicode_DATA(unicode);
1506 switch (PyUnicode_KIND(unicode))
1507 {
1508 case PyUnicode_1BYTE_KIND:
1509 memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1510 break;
1511 case PyUnicode_2BYTE_KIND:
1512 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1513 break;
1514 case PyUnicode_4BYTE_KIND:
1515 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1516 break;
1517 default:
1518 assert(0);
1519 break;
1520 }
1521 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001522}
1523
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001524
Victor Stinnerbc603d12011-10-02 01:00:40 +02001525/* Widen Unicode objects to larger buffers. Don't write terminating null
1526 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001527
1528void*
1529_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1530{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001531 Py_ssize_t len;
1532 void *result;
1533 unsigned int skind;
1534
1535 if (PyUnicode_READY(s))
1536 return NULL;
1537
1538 len = PyUnicode_GET_LENGTH(s);
1539 skind = PyUnicode_KIND(s);
1540 if (skind >= kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001541 PyErr_SetString(PyExc_RuntimeError, "invalid widening attempt");
1542 return NULL;
1543 }
1544 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001545 case PyUnicode_2BYTE_KIND:
1546 result = PyMem_Malloc(len * sizeof(Py_UCS2));
1547 if (!result)
1548 return PyErr_NoMemory();
1549 assert(skind == PyUnicode_1BYTE_KIND);
1550 _PyUnicode_CONVERT_BYTES(
1551 Py_UCS1, Py_UCS2,
1552 PyUnicode_1BYTE_DATA(s),
1553 PyUnicode_1BYTE_DATA(s) + len,
1554 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001555 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001556 case PyUnicode_4BYTE_KIND:
1557 result = PyMem_Malloc(len * sizeof(Py_UCS4));
1558 if (!result)
1559 return PyErr_NoMemory();
1560 if (skind == PyUnicode_2BYTE_KIND) {
1561 _PyUnicode_CONVERT_BYTES(
1562 Py_UCS2, Py_UCS4,
1563 PyUnicode_2BYTE_DATA(s),
1564 PyUnicode_2BYTE_DATA(s) + len,
1565 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001566 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001567 else {
1568 assert(skind == PyUnicode_1BYTE_KIND);
1569 _PyUnicode_CONVERT_BYTES(
1570 Py_UCS1, Py_UCS4,
1571 PyUnicode_1BYTE_DATA(s),
1572 PyUnicode_1BYTE_DATA(s) + len,
1573 result);
1574 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001575 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001576 default:
1577 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001578 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001579 PyErr_SetString(PyExc_ValueError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001580 return NULL;
1581}
1582
1583static Py_UCS4*
1584as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1585 int copy_null)
1586{
1587 int kind;
1588 void *data;
1589 Py_ssize_t len, targetlen;
1590 if (PyUnicode_READY(string) == -1)
1591 return NULL;
1592 kind = PyUnicode_KIND(string);
1593 data = PyUnicode_DATA(string);
1594 len = PyUnicode_GET_LENGTH(string);
1595 targetlen = len;
1596 if (copy_null)
1597 targetlen++;
1598 if (!target) {
1599 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1600 PyErr_NoMemory();
1601 return NULL;
1602 }
1603 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1604 if (!target) {
1605 PyErr_NoMemory();
1606 return NULL;
1607 }
1608 }
1609 else {
1610 if (targetsize < targetlen) {
1611 PyErr_Format(PyExc_SystemError,
1612 "string is longer than the buffer");
1613 if (copy_null && 0 < targetsize)
1614 target[0] = 0;
1615 return NULL;
1616 }
1617 }
1618 if (kind != PyUnicode_4BYTE_KIND) {
1619 Py_ssize_t i;
1620 for (i = 0; i < len; i++)
1621 target[i] = PyUnicode_READ(kind, data, i);
1622 }
1623 else
1624 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1625 if (copy_null)
1626 target[len] = 0;
1627 return target;
1628}
1629
1630Py_UCS4*
1631PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1632 int copy_null)
1633{
1634 if (target == NULL || targetsize < 1) {
1635 PyErr_BadInternalCall();
1636 return NULL;
1637 }
1638 return as_ucs4(string, target, targetsize, copy_null);
1639}
1640
1641Py_UCS4*
1642PyUnicode_AsUCS4Copy(PyObject *string)
1643{
1644 return as_ucs4(string, NULL, 0, 1);
1645}
1646
1647#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00001648
Alexander Belopolsky40018472011-02-26 01:02:56 +00001649PyObject *
1650PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001651{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001652 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +00001653 if (size == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001654 return PyUnicode_New(0, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00001655 PyErr_BadInternalCall();
1656 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001657 }
1658
Martin v. Löwis790465f2008-04-05 20:41:37 +00001659 if (size == -1) {
1660 size = wcslen(w);
1661 }
1662
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001663 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001664}
1665
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001666#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00001667
Walter Dörwald346737f2007-05-31 10:44:43 +00001668static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001669makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
1670 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00001671{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001672 *fmt++ = '%';
1673 if (width) {
1674 if (zeropad)
1675 *fmt++ = '0';
1676 fmt += sprintf(fmt, "%d", width);
1677 }
1678 if (precision)
1679 fmt += sprintf(fmt, ".%d", precision);
1680 if (longflag)
1681 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001682 else if (longlongflag) {
1683 /* longlongflag should only ever be nonzero on machines with
1684 HAVE_LONG_LONG defined */
1685#ifdef HAVE_LONG_LONG
1686 char *f = PY_FORMAT_LONG_LONG;
1687 while (*f)
1688 *fmt++ = *f++;
1689#else
1690 /* we shouldn't ever get here */
1691 assert(0);
1692 *fmt++ = 'l';
1693#endif
1694 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001695 else if (size_tflag) {
1696 char *f = PY_FORMAT_SIZE_T;
1697 while (*f)
1698 *fmt++ = *f++;
1699 }
1700 *fmt++ = c;
1701 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00001702}
1703
Victor Stinner96865452011-03-01 23:44:09 +00001704/* helper for PyUnicode_FromFormatV() */
1705
1706static const char*
1707parse_format_flags(const char *f,
1708 int *p_width, int *p_precision,
1709 int *p_longflag, int *p_longlongflag, int *p_size_tflag)
1710{
1711 int width, precision, longflag, longlongflag, size_tflag;
1712
1713 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
1714 f++;
1715 width = 0;
1716 while (Py_ISDIGIT((unsigned)*f))
1717 width = (width*10) + *f++ - '0';
1718 precision = 0;
1719 if (*f == '.') {
1720 f++;
1721 while (Py_ISDIGIT((unsigned)*f))
1722 precision = (precision*10) + *f++ - '0';
1723 if (*f == '%') {
1724 /* "%.3%s" => f points to "3" */
1725 f--;
1726 }
1727 }
1728 if (*f == '\0') {
1729 /* bogus format "%.1" => go backward, f points to "1" */
1730 f--;
1731 }
1732 if (p_width != NULL)
1733 *p_width = width;
1734 if (p_precision != NULL)
1735 *p_precision = precision;
1736
1737 /* Handle %ld, %lu, %lld and %llu. */
1738 longflag = 0;
1739 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00001740 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00001741
1742 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00001743 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00001744 longflag = 1;
1745 ++f;
1746 }
1747#ifdef HAVE_LONG_LONG
1748 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00001749 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00001750 longlongflag = 1;
1751 f += 2;
1752 }
1753#endif
1754 }
1755 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00001756 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00001757 size_tflag = 1;
1758 ++f;
1759 }
1760 if (p_longflag != NULL)
1761 *p_longflag = longflag;
1762 if (p_longlongflag != NULL)
1763 *p_longlongflag = longlongflag;
1764 if (p_size_tflag != NULL)
1765 *p_size_tflag = size_tflag;
1766 return f;
1767}
1768
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001769/* maximum number of characters required for output of %ld. 21 characters
1770 allows for 64-bit integers (in decimal) and an optional sign. */
1771#define MAX_LONG_CHARS 21
1772/* maximum number of characters required for output of %lld.
1773 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
1774 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
1775#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
1776
Walter Dörwaldd2034312007-05-18 16:29:38 +00001777PyObject *
1778PyUnicode_FromFormatV(const char *format, va_list vargs)
1779{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001780 va_list count;
1781 Py_ssize_t callcount = 0;
1782 PyObject **callresults = NULL;
1783 PyObject **callresult = NULL;
1784 Py_ssize_t n = 0;
1785 int width = 0;
1786 int precision = 0;
1787 int zeropad;
1788 const char* f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001789 PyUnicodeObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001790 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001791 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001792 Py_UCS4 maxchar = 127; /* result is ASCII by default */
1793 Py_UCS4 argmaxchar;
1794 Py_ssize_t numbersize = 0;
1795 char *numberresults = NULL;
1796 char *numberresult = NULL;
1797 Py_ssize_t i;
1798 int kind;
1799 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001800
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001801 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001802 /* step 1: count the number of %S/%R/%A/%s format specifications
1803 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
1804 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001805 * result in an array)
1806 * also esimate a upper bound for all the number formats in the string,
1807 * numbers will be formated in step 3 and be keept in a '\0'-separated
1808 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00001809 for (f = format; *f; f++) {
1810 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001811 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001812 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
1813 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
1814 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
1815 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001816
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001817 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001818#ifdef HAVE_LONG_LONG
1819 if (longlongflag) {
1820 if (width < MAX_LONG_LONG_CHARS)
1821 width = MAX_LONG_LONG_CHARS;
1822 }
1823 else
1824#endif
1825 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
1826 including sign. Decimal takes the most space. This
1827 isn't enough for octal. If a width is specified we
1828 need more (which we allocate later). */
1829 if (width < MAX_LONG_CHARS)
1830 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001831
1832 /* account for the size + '\0' to separate numbers
1833 inside of the numberresults buffer */
1834 numbersize += (width + 1);
1835 }
1836 }
1837 else if ((unsigned char)*f > 127) {
1838 PyErr_Format(PyExc_ValueError,
1839 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
1840 "string, got a non-ASCII byte: 0x%02x",
1841 (unsigned char)*f);
1842 return NULL;
1843 }
1844 }
1845 /* step 2: allocate memory for the results of
1846 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
1847 if (callcount) {
1848 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
1849 if (!callresults) {
1850 PyErr_NoMemory();
1851 return NULL;
1852 }
1853 callresult = callresults;
1854 }
1855 /* step 2.5: allocate memory for the results of formating numbers */
1856 if (numbersize) {
1857 numberresults = PyObject_Malloc(numbersize);
1858 if (!numberresults) {
1859 PyErr_NoMemory();
1860 goto fail;
1861 }
1862 numberresult = numberresults;
1863 }
1864
1865 /* step 3: format numbers and figure out how large a buffer we need */
1866 for (f = format; *f; f++) {
1867 if (*f == '%') {
1868 const char* p;
1869 int longflag;
1870 int longlongflag;
1871 int size_tflag;
1872 int numprinted;
1873
1874 p = f;
1875 zeropad = (f[1] == '0');
1876 f = parse_format_flags(f, &width, &precision,
1877 &longflag, &longlongflag, &size_tflag);
1878 switch (*f) {
1879 case 'c':
1880 {
1881 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001882 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001883 n++;
1884 break;
1885 }
1886 case '%':
1887 n++;
1888 break;
1889 case 'i':
1890 case 'd':
1891 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1892 width, precision, *f);
1893 if (longflag)
1894 numprinted = sprintf(numberresult, fmt,
1895 va_arg(count, long));
1896#ifdef HAVE_LONG_LONG
1897 else if (longlongflag)
1898 numprinted = sprintf(numberresult, fmt,
1899 va_arg(count, PY_LONG_LONG));
1900#endif
1901 else if (size_tflag)
1902 numprinted = sprintf(numberresult, fmt,
1903 va_arg(count, Py_ssize_t));
1904 else
1905 numprinted = sprintf(numberresult, fmt,
1906 va_arg(count, int));
1907 n += numprinted;
1908 /* advance by +1 to skip over the '\0' */
1909 numberresult += (numprinted + 1);
1910 assert(*(numberresult - 1) == '\0');
1911 assert(*(numberresult - 2) != '\0');
1912 assert(numprinted >= 0);
1913 assert(numberresult <= numberresults + numbersize);
1914 break;
1915 case 'u':
1916 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1917 width, precision, 'u');
1918 if (longflag)
1919 numprinted = sprintf(numberresult, fmt,
1920 va_arg(count, unsigned long));
1921#ifdef HAVE_LONG_LONG
1922 else if (longlongflag)
1923 numprinted = sprintf(numberresult, fmt,
1924 va_arg(count, unsigned PY_LONG_LONG));
1925#endif
1926 else if (size_tflag)
1927 numprinted = sprintf(numberresult, fmt,
1928 va_arg(count, size_t));
1929 else
1930 numprinted = sprintf(numberresult, fmt,
1931 va_arg(count, unsigned int));
1932 n += numprinted;
1933 numberresult += (numprinted + 1);
1934 assert(*(numberresult - 1) == '\0');
1935 assert(*(numberresult - 2) != '\0');
1936 assert(numprinted >= 0);
1937 assert(numberresult <= numberresults + numbersize);
1938 break;
1939 case 'x':
1940 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
1941 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
1942 n += numprinted;
1943 numberresult += (numprinted + 1);
1944 assert(*(numberresult - 1) == '\0');
1945 assert(*(numberresult - 2) != '\0');
1946 assert(numprinted >= 0);
1947 assert(numberresult <= numberresults + numbersize);
1948 break;
1949 case 'p':
1950 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
1951 /* %p is ill-defined: ensure leading 0x. */
1952 if (numberresult[1] == 'X')
1953 numberresult[1] = 'x';
1954 else if (numberresult[1] != 'x') {
1955 memmove(numberresult + 2, numberresult,
1956 strlen(numberresult) + 1);
1957 numberresult[0] = '0';
1958 numberresult[1] = 'x';
1959 numprinted += 2;
1960 }
1961 n += numprinted;
1962 numberresult += (numprinted + 1);
1963 assert(*(numberresult - 1) == '\0');
1964 assert(*(numberresult - 2) != '\0');
1965 assert(numprinted >= 0);
1966 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001967 break;
1968 case 's':
1969 {
1970 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00001971 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001972 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
1973 if (!str)
1974 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001975 /* since PyUnicode_DecodeUTF8 returns already flexible
1976 unicode objects, there is no need to call ready on them */
1977 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001978 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001979 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001980 /* Remember the str and switch to the next slot */
1981 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001982 break;
1983 }
1984 case 'U':
1985 {
1986 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02001987 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001988 if (PyUnicode_READY(obj) == -1)
1989 goto fail;
1990 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001991 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001992 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001993 break;
1994 }
1995 case 'V':
1996 {
1997 PyObject *obj = va_arg(count, PyObject *);
1998 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001999 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002000 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002001 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002002 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002003 if (PyUnicode_READY(obj) == -1)
2004 goto fail;
2005 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002006 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002007 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002008 *callresult++ = NULL;
2009 }
2010 else {
2011 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2012 if (!str_obj)
2013 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002014 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002015 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002016 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002017 *callresult++ = str_obj;
2018 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002019 break;
2020 }
2021 case 'S':
2022 {
2023 PyObject *obj = va_arg(count, PyObject *);
2024 PyObject *str;
2025 assert(obj);
2026 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002027 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002028 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002029 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002030 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002031 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002032 /* Remember the str and switch to the next slot */
2033 *callresult++ = str;
2034 break;
2035 }
2036 case 'R':
2037 {
2038 PyObject *obj = va_arg(count, PyObject *);
2039 PyObject *repr;
2040 assert(obj);
2041 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002042 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002043 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002044 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002045 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002046 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002047 /* Remember the repr and switch to the next slot */
2048 *callresult++ = repr;
2049 break;
2050 }
2051 case 'A':
2052 {
2053 PyObject *obj = va_arg(count, PyObject *);
2054 PyObject *ascii;
2055 assert(obj);
2056 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002057 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002058 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002059 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002060 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002061 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002062 /* Remember the repr and switch to the next slot */
2063 *callresult++ = ascii;
2064 break;
2065 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002066 default:
2067 /* if we stumble upon an unknown
2068 formatting code, copy the rest of
2069 the format string to the output
2070 string. (we cannot just skip the
2071 code, since there's no way to know
2072 what's in the argument list) */
2073 n += strlen(p);
2074 goto expand;
2075 }
2076 } else
2077 n++;
2078 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002079 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002080 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002081 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002082 we don't have to resize the string.
2083 There can be no errors beyond this point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002084 string = (PyUnicodeObject *)PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002085 if (!string)
2086 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002087 kind = PyUnicode_KIND(string);
2088 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002089 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002090 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002091
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002092 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002093 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002094 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002095
2096 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002097 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2098 /* checking for == because the last argument could be a empty
2099 string, which causes i to point to end, the assert at the end of
2100 the loop */
2101 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002102
Benjamin Peterson14339b62009-01-31 16:36:08 +00002103 switch (*f) {
2104 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002105 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002106 const int ordinal = va_arg(vargs, int);
2107 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002108 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002109 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002110 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002111 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002112 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002113 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002114 case 'p':
2115 /* unused, since we already have the result */
2116 if (*f == 'p')
2117 (void) va_arg(vargs, void *);
2118 else
2119 (void) va_arg(vargs, int);
2120 /* extract the result from numberresults and append. */
2121 for (; *numberresult; ++i, ++numberresult)
2122 PyUnicode_WRITE(kind, data, i, *numberresult);
2123 /* skip over the separating '\0' */
2124 assert(*numberresult == '\0');
2125 numberresult++;
2126 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002127 break;
2128 case 's':
2129 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002130 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002131 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002132 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002133 size = PyUnicode_GET_LENGTH(*callresult);
2134 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002135 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2136 *callresult, 0,
2137 size) < 0)
2138 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002139 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002140 /* We're done with the unicode()/repr() => forget it */
2141 Py_DECREF(*callresult);
2142 /* switch to next unicode()/repr() result */
2143 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002144 break;
2145 }
2146 case 'U':
2147 {
2148 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002149 Py_ssize_t size;
2150 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2151 size = PyUnicode_GET_LENGTH(obj);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002152 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2153 obj, 0,
2154 size) < 0)
2155 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002156 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002157 break;
2158 }
2159 case 'V':
2160 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002161 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002162 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002163 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002164 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002165 size = PyUnicode_GET_LENGTH(obj);
2166 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002167 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2168 obj, 0,
2169 size) < 0)
2170 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002171 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002172 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002173 size = PyUnicode_GET_LENGTH(*callresult);
2174 assert(PyUnicode_KIND(*callresult) <=
2175 PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002176 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2177 *callresult,
2178 0, size) < 0)
2179 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002180 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002181 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002182 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002183 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002184 break;
2185 }
2186 case 'S':
2187 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002188 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002189 {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002190 /* unused, since we already have the result */
2191 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002192 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002193 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2194 *callresult, 0,
2195 PyUnicode_GET_LENGTH(*callresult)) < 0)
2196 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002197 i += PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002198 /* We're done with the unicode()/repr() => forget it */
2199 Py_DECREF(*callresult);
2200 /* switch to next unicode()/repr() result */
2201 ++callresult;
2202 break;
2203 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002204 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002205 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002206 break;
2207 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002208 for (; *p; ++p, ++i)
2209 PyUnicode_WRITE(kind, data, i, *p);
2210 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002211 goto end;
2212 }
Victor Stinner1205f272010-09-11 00:54:47 +00002213 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002214 else {
2215 assert(i < PyUnicode_GET_LENGTH(string));
2216 PyUnicode_WRITE(kind, data, i++, *f);
2217 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002218 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002219 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002220
Benjamin Peterson29060642009-01-31 22:14:21 +00002221 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002222 if (callresults)
2223 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002224 if (numberresults)
2225 PyObject_Free(numberresults);
2226 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00002227 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002228 if (callresults) {
2229 PyObject **callresult2 = callresults;
2230 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002231 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002232 ++callresult2;
2233 }
2234 PyObject_Free(callresults);
2235 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002236 if (numberresults)
2237 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002238 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002239}
2240
Walter Dörwaldd2034312007-05-18 16:29:38 +00002241PyObject *
2242PyUnicode_FromFormat(const char *format, ...)
2243{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002244 PyObject* ret;
2245 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002246
2247#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002248 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002249#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002250 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002251#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002252 ret = PyUnicode_FromFormatV(format, vargs);
2253 va_end(vargs);
2254 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002255}
2256
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002257#ifdef HAVE_WCHAR_H
2258
Victor Stinner5593d8a2010-10-02 11:11:27 +00002259/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2260 convert a Unicode object to a wide character string.
2261
Victor Stinnerd88d9832011-09-06 02:00:05 +02002262 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002263 character) required to convert the unicode object. Ignore size argument.
2264
Victor Stinnerd88d9832011-09-06 02:00:05 +02002265 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002266 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002267 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002268static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00002269unicode_aswidechar(PyUnicodeObject *unicode,
2270 wchar_t *w,
2271 Py_ssize_t size)
2272{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002273 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002274 const wchar_t *wstr;
2275
2276 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
2277 if (wstr == NULL)
2278 return -1;
2279
Victor Stinner5593d8a2010-10-02 11:11:27 +00002280 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002281 if (size > res)
2282 size = res + 1;
2283 else
2284 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002285 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002286 return res;
2287 }
2288 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002289 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002290}
2291
2292Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002293PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002294 wchar_t *w,
2295 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002296{
2297 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002298 PyErr_BadInternalCall();
2299 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002300 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002301 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002302}
2303
Victor Stinner137c34c2010-09-29 10:25:54 +00002304wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002305PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002306 Py_ssize_t *size)
2307{
2308 wchar_t* buffer;
2309 Py_ssize_t buflen;
2310
2311 if (unicode == NULL) {
2312 PyErr_BadInternalCall();
2313 return NULL;
2314 }
2315
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002316 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002317 if (buflen == -1)
2318 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002319 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002320 PyErr_NoMemory();
2321 return NULL;
2322 }
2323
Victor Stinner137c34c2010-09-29 10:25:54 +00002324 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2325 if (buffer == NULL) {
2326 PyErr_NoMemory();
2327 return NULL;
2328 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002329 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002330 if (buflen == -1)
2331 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002332 if (size != NULL)
2333 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002334 return buffer;
2335}
2336
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002337#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002338
Alexander Belopolsky40018472011-02-26 01:02:56 +00002339PyObject *
2340PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002341{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002342 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002343 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002344 PyErr_SetString(PyExc_ValueError,
2345 "chr() arg not in range(0x110000)");
2346 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002347 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002348
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002349 if (ordinal < 256)
2350 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002351
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002352 v = PyUnicode_New(1, ordinal);
2353 if (v == NULL)
2354 return NULL;
2355 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
2356 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002357}
2358
Alexander Belopolsky40018472011-02-26 01:02:56 +00002359PyObject *
2360PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002361{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002362 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002363 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002364 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002365 if (PyUnicode_READY(obj))
2366 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002367 Py_INCREF(obj);
2368 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002369 }
2370 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002371 /* For a Unicode subtype that's not a Unicode object,
2372 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002373 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002374 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002375 PyErr_Format(PyExc_TypeError,
2376 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002377 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002378 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002379}
2380
Alexander Belopolsky40018472011-02-26 01:02:56 +00002381PyObject *
2382PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002383 const char *encoding,
2384 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002385{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002386 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002387 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002388
Guido van Rossumd57fd912000-03-10 22:53:23 +00002389 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002390 PyErr_BadInternalCall();
2391 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002392 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002393
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002394 /* Decoding bytes objects is the most common case and should be fast */
2395 if (PyBytes_Check(obj)) {
2396 if (PyBytes_GET_SIZE(obj) == 0) {
2397 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002398 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002399 }
2400 else {
2401 v = PyUnicode_Decode(
2402 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2403 encoding, errors);
2404 }
2405 return v;
2406 }
2407
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002408 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002409 PyErr_SetString(PyExc_TypeError,
2410 "decoding str is not supported");
2411 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002412 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002413
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002414 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2415 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2416 PyErr_Format(PyExc_TypeError,
2417 "coercing to str: need bytes, bytearray "
2418 "or buffer-like object, %.80s found",
2419 Py_TYPE(obj)->tp_name);
2420 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002421 }
Tim Petersced69f82003-09-16 20:30:58 +00002422
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002423 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002424 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002425 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002426 }
Tim Petersced69f82003-09-16 20:30:58 +00002427 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002428 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002429
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002430 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002431 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002432}
2433
/* Copy encoding into lower, folding upper case letters to lower case and
   turning every '_' into '-', so that spellings such as UTF_8 are caught.

   lower_len is the capacity of lower, terminator included.  Return 1 on
   success (lower is null-terminated), 0 if encoding is longer than
   lower_len-1 characters. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst = lower;
    /* Last slot of the output buffer, reserved for the terminator. */
    char *limit = lower + (lower_len - 1);

    for (src = encoding; *src != '\0'; src++) {
        char c = *src;

        if (dst == limit)
            return 0;

        if (Py_ISUPPER(c))
            c = Py_TOLOWER(c);
        else if (c == '_')
            c = '-';
        *dst++ = c;
    }
    *dst = '\0';
    return 1;
}
2466
Alexander Belopolsky40018472011-02-26 01:02:56 +00002467PyObject *
2468PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002469 Py_ssize_t size,
2470 const char *encoding,
2471 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002472{
2473 PyObject *buffer = NULL, *unicode;
2474 Py_buffer info;
2475 char lower[11]; /* Enough for any encoding shortcut */
2476
2477 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002478 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00002479
2480 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002481 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002482 if ((strcmp(lower, "utf-8") == 0) ||
2483 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002484 return PyUnicode_DecodeUTF8(s, size, errors);
2485 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002486 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002487 (strcmp(lower, "iso-8859-1") == 0))
2488 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002489#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002490 else if (strcmp(lower, "mbcs") == 0)
2491 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002492#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002493 else if (strcmp(lower, "ascii") == 0)
2494 return PyUnicode_DecodeASCII(s, size, errors);
2495 else if (strcmp(lower, "utf-16") == 0)
2496 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2497 else if (strcmp(lower, "utf-32") == 0)
2498 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2499 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002500
2501 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002502 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002503 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002504 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002505 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002506 if (buffer == NULL)
2507 goto onError;
2508 unicode = PyCodec_Decode(buffer, encoding, errors);
2509 if (unicode == NULL)
2510 goto onError;
2511 if (!PyUnicode_Check(unicode)) {
2512 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002513 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002514 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002515 Py_DECREF(unicode);
2516 goto onError;
2517 }
2518 Py_DECREF(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002519 if (PyUnicode_READY(unicode)) {
2520 Py_DECREF(unicode);
2521 return NULL;
2522 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002523 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002524
Benjamin Peterson29060642009-01-31 22:14:21 +00002525 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002526 Py_XDECREF(buffer);
2527 return NULL;
2528}
2529
Alexander Belopolsky40018472011-02-26 01:02:56 +00002530PyObject *
2531PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002532 const char *encoding,
2533 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002534{
2535 PyObject *v;
2536
2537 if (!PyUnicode_Check(unicode)) {
2538 PyErr_BadArgument();
2539 goto onError;
2540 }
2541
2542 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002543 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002544
2545 /* Decode via the codec registry */
2546 v = PyCodec_Decode(unicode, encoding, errors);
2547 if (v == NULL)
2548 goto onError;
2549 return v;
2550
Benjamin Peterson29060642009-01-31 22:14:21 +00002551 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002552 return NULL;
2553}
2554
Alexander Belopolsky40018472011-02-26 01:02:56 +00002555PyObject *
2556PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002557 const char *encoding,
2558 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002559{
2560 PyObject *v;
2561
2562 if (!PyUnicode_Check(unicode)) {
2563 PyErr_BadArgument();
2564 goto onError;
2565 }
2566
2567 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002568 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002569
2570 /* Decode via the codec registry */
2571 v = PyCodec_Decode(unicode, encoding, errors);
2572 if (v == NULL)
2573 goto onError;
2574 if (!PyUnicode_Check(v)) {
2575 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002576 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002577 Py_TYPE(v)->tp_name);
2578 Py_DECREF(v);
2579 goto onError;
2580 }
2581 return v;
2582
Benjamin Peterson29060642009-01-31 22:14:21 +00002583 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002584 return NULL;
2585}
2586
Alexander Belopolsky40018472011-02-26 01:02:56 +00002587PyObject *
2588PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002589 Py_ssize_t size,
2590 const char *encoding,
2591 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002592{
2593 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002594
Guido van Rossumd57fd912000-03-10 22:53:23 +00002595 unicode = PyUnicode_FromUnicode(s, size);
2596 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002597 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002598 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2599 Py_DECREF(unicode);
2600 return v;
2601}
2602
Alexander Belopolsky40018472011-02-26 01:02:56 +00002603PyObject *
2604PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002605 const char *encoding,
2606 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002607{
2608 PyObject *v;
2609
2610 if (!PyUnicode_Check(unicode)) {
2611 PyErr_BadArgument();
2612 goto onError;
2613 }
2614
2615 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002616 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002617
2618 /* Encode via the codec registry */
2619 v = PyCodec_Encode(unicode, encoding, errors);
2620 if (v == NULL)
2621 goto onError;
2622 return v;
2623
Benjamin Peterson29060642009-01-31 22:14:21 +00002624 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002625 return NULL;
2626}
2627
Victor Stinnerad158722010-10-27 00:25:46 +00002628PyObject *
2629PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00002630{
Victor Stinner99b95382011-07-04 14:23:54 +02002631#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002632 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2633 PyUnicode_GET_SIZE(unicode),
2634 NULL);
2635#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002636 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00002637#else
Victor Stinner793b5312011-04-27 00:24:21 +02002638 PyInterpreterState *interp = PyThreadState_GET()->interp;
2639 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2640 cannot use it to encode and decode filenames before it is loaded. Load
2641 the Python codec requires to encode at least its own filename. Use the C
2642 version of the locale codec until the codec registry is initialized and
2643 the Python codec is loaded.
2644
2645 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2646 cannot only rely on it: check also interp->fscodec_initialized for
2647 subinterpreters. */
2648 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00002649 return PyUnicode_AsEncodedString(unicode,
2650 Py_FileSystemDefaultEncoding,
2651 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00002652 }
2653 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002654 /* locale encoding with surrogateescape */
2655 wchar_t *wchar;
2656 char *bytes;
2657 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00002658 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002659
2660 wchar = PyUnicode_AsWideCharString(unicode, NULL);
2661 if (wchar == NULL)
2662 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002663 bytes = _Py_wchar2char(wchar, &error_pos);
2664 if (bytes == NULL) {
2665 if (error_pos != (size_t)-1) {
2666 char *errmsg = strerror(errno);
2667 PyObject *exc = NULL;
2668 if (errmsg == NULL)
2669 errmsg = "Py_wchar2char() failed";
2670 raise_encode_exception(&exc,
2671 "filesystemencoding",
2672 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
2673 error_pos, error_pos+1,
2674 errmsg);
2675 Py_XDECREF(exc);
2676 }
2677 else
2678 PyErr_NoMemory();
2679 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002680 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002681 }
2682 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002683
2684 bytes_obj = PyBytes_FromString(bytes);
2685 PyMem_Free(bytes);
2686 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00002687 }
Victor Stinnerad158722010-10-27 00:25:46 +00002688#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00002689}
2690
Alexander Belopolsky40018472011-02-26 01:02:56 +00002691PyObject *
2692PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002693 const char *encoding,
2694 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002695{
2696 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00002697 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00002698
Guido van Rossumd57fd912000-03-10 22:53:23 +00002699 if (!PyUnicode_Check(unicode)) {
2700 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002701 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002702 }
Fred Drakee4315f52000-05-09 19:53:39 +00002703
Victor Stinner2f283c22011-03-02 01:21:46 +00002704 if (encoding == NULL) {
2705 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002706 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002707 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002708 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner2f283c22011-03-02 01:21:46 +00002709 }
Fred Drakee4315f52000-05-09 19:53:39 +00002710
2711 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002712 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002713 if ((strcmp(lower, "utf-8") == 0) ||
2714 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00002715 {
Victor Stinner2f283c22011-03-02 01:21:46 +00002716 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002717 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002718 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002719 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00002720 }
Victor Stinner37296e82010-06-10 13:36:23 +00002721 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002722 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002723 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002724 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002725#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002726 else if (strcmp(lower, "mbcs") == 0)
2727 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2728 PyUnicode_GET_SIZE(unicode),
2729 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002730#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002731 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002732 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00002733 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002734
2735 /* Encode via the codec registry */
2736 v = PyCodec_Encode(unicode, encoding, errors);
2737 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002738 return NULL;
2739
2740 /* The normal path */
2741 if (PyBytes_Check(v))
2742 return v;
2743
2744 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002745 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002746 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002747 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002748
2749 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
2750 "encoder %s returned bytearray instead of bytes",
2751 encoding);
2752 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002753 Py_DECREF(v);
2754 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002755 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002756
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002757 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
2758 Py_DECREF(v);
2759 return b;
2760 }
2761
2762 PyErr_Format(PyExc_TypeError,
2763 "encoder did not return a bytes object (type=%.400s)",
2764 Py_TYPE(v)->tp_name);
2765 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002766 return NULL;
2767}
2768
Alexander Belopolsky40018472011-02-26 01:02:56 +00002769PyObject *
2770PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002771 const char *encoding,
2772 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002773{
2774 PyObject *v;
2775
2776 if (!PyUnicode_Check(unicode)) {
2777 PyErr_BadArgument();
2778 goto onError;
2779 }
2780
2781 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002782 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002783
2784 /* Encode via the codec registry */
2785 v = PyCodec_Encode(unicode, encoding, errors);
2786 if (v == NULL)
2787 goto onError;
2788 if (!PyUnicode_Check(v)) {
2789 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002790 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002791 Py_TYPE(v)->tp_name);
2792 Py_DECREF(v);
2793 goto onError;
2794 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002795 return v;
Tim Petersced69f82003-09-16 20:30:58 +00002796
Benjamin Peterson29060642009-01-31 22:14:21 +00002797 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002798 return NULL;
2799}
2800
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002801PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00002802PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002803 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00002804 return PyUnicode_DecodeFSDefaultAndSize(s, size);
2805}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002806
Christian Heimes5894ba72007-11-04 11:43:14 +00002807PyObject*
2808PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
2809{
Victor Stinner99b95382011-07-04 14:23:54 +02002810#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002811 return PyUnicode_DecodeMBCS(s, size, NULL);
2812#elif defined(__APPLE__)
2813 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
2814#else
Victor Stinner793b5312011-04-27 00:24:21 +02002815 PyInterpreterState *interp = PyThreadState_GET()->interp;
2816 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2817 cannot use it to encode and decode filenames before it is loaded. Load
2818 the Python codec requires to encode at least its own filename. Use the C
2819 version of the locale codec until the codec registry is initialized and
2820 the Python codec is loaded.
2821
2822 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2823 cannot only rely on it: check also interp->fscodec_initialized for
2824 subinterpreters. */
2825 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002826 return PyUnicode_Decode(s, size,
2827 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00002828 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002829 }
2830 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002831 /* locale encoding with surrogateescape */
2832 wchar_t *wchar;
2833 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00002834 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002835
2836 if (s[size] != '\0' || size != strlen(s)) {
2837 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2838 return NULL;
2839 }
2840
Victor Stinner168e1172010-10-16 23:16:16 +00002841 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002842 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00002843 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002844
Victor Stinner168e1172010-10-16 23:16:16 +00002845 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002846 PyMem_Free(wchar);
2847 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002848 }
Victor Stinnerad158722010-10-27 00:25:46 +00002849#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002850}
2851
Martin v. Löwis011e8422009-05-05 04:43:17 +00002852
2853int
2854PyUnicode_FSConverter(PyObject* arg, void* addr)
2855{
2856 PyObject *output = NULL;
2857 Py_ssize_t size;
2858 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002859 if (arg == NULL) {
2860 Py_DECREF(*(PyObject**)addr);
2861 return 1;
2862 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00002863 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00002864 output = arg;
2865 Py_INCREF(output);
2866 }
2867 else {
2868 arg = PyUnicode_FromObject(arg);
2869 if (!arg)
2870 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00002871 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002872 Py_DECREF(arg);
2873 if (!output)
2874 return 0;
2875 if (!PyBytes_Check(output)) {
2876 Py_DECREF(output);
2877 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
2878 return 0;
2879 }
2880 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00002881 size = PyBytes_GET_SIZE(output);
2882 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002883 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05002884 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00002885 Py_DECREF(output);
2886 return 0;
2887 }
2888 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002889 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00002890}
2891
2892
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002893int
2894PyUnicode_FSDecoder(PyObject* arg, void* addr)
2895{
2896 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002897 if (arg == NULL) {
2898 Py_DECREF(*(PyObject**)addr);
2899 return 1;
2900 }
2901 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002902 if (PyUnicode_READY(arg))
2903 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002904 output = arg;
2905 Py_INCREF(output);
2906 }
2907 else {
2908 arg = PyBytes_FromObject(arg);
2909 if (!arg)
2910 return 0;
2911 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
2912 PyBytes_GET_SIZE(arg));
2913 Py_DECREF(arg);
2914 if (!output)
2915 return 0;
2916 if (!PyUnicode_Check(output)) {
2917 Py_DECREF(output);
2918 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
2919 return 0;
2920 }
2921 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002922 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
2923 PyUnicode_GET_LENGTH(output), 0, 1)) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002924 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2925 Py_DECREF(output);
2926 return 0;
2927 }
2928 *(PyObject**)addr = output;
2929 return Py_CLEANUP_SUPPORTED;
2930}
2931
2932
Martin v. Löwis5b222132007-06-10 09:51:05 +00002933char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002934PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002935{
Christian Heimesf3863112007-11-22 07:46:41 +00002936 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002937 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
2938
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00002939 if (!PyUnicode_Check(unicode)) {
2940 PyErr_BadArgument();
2941 return NULL;
2942 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002943 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002944 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002945
Victor Stinnere90fe6a2011-10-01 16:48:13 +02002946 if (PyUnicode_UTF8(unicode) == NULL) {
2947 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002948 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
2949 if (bytes == NULL)
2950 return NULL;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02002951 _PyUnicode_UTF8(u) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
2952 if (_PyUnicode_UTF8(u) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002953 Py_DECREF(bytes);
2954 return NULL;
2955 }
Victor Stinnere90fe6a2011-10-01 16:48:13 +02002956 _PyUnicode_UTF8_LENGTH(u) = PyBytes_GET_SIZE(bytes);
2957 Py_MEMCPY(_PyUnicode_UTF8(u), PyBytes_AS_STRING(bytes), _PyUnicode_UTF8_LENGTH(u) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002958 Py_DECREF(bytes);
2959 }
2960
2961 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02002962 *psize = PyUnicode_UTF8_LENGTH(unicode);
2963 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00002964}
2965
2966char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002967PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00002968{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002969 return PyUnicode_AsUTF8AndSize(unicode, NULL);
2970}
2971
#if defined(Py_DEBUG)
/* Debug builds only: incremented by PyUnicode_AsUnicodeAndSize() each time
   it has to build the wchar_t representation of a string. */
int unicode_as_unicode_calls = 0;
#endif
2975
2976
2977Py_UNICODE *
2978PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
2979{
2980 PyUnicodeObject *u;
2981 const unsigned char *one_byte;
2982#if SIZEOF_WCHAR_T == 4
2983 const Py_UCS2 *two_bytes;
2984#else
2985 const Py_UCS4 *four_bytes;
2986 const Py_UCS4 *ucs4_end;
2987 Py_ssize_t num_surrogates;
2988#endif
2989 wchar_t *w;
2990 wchar_t *wchar_end;
2991
2992 if (!PyUnicode_Check(unicode)) {
2993 PyErr_BadArgument();
2994 return NULL;
2995 }
2996 u = (PyUnicodeObject*)unicode;
2997 if (_PyUnicode_WSTR(u) == NULL) {
2998 /* Non-ASCII compact unicode object */
2999 assert(_PyUnicode_KIND(u) != 0);
3000 assert(PyUnicode_IS_READY(u));
3001
3002#ifdef Py_DEBUG
3003 ++unicode_as_unicode_calls;
3004#endif
3005
3006 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
3007#if SIZEOF_WCHAR_T == 2
3008 four_bytes = PyUnicode_4BYTE_DATA(u);
3009 ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
3010 num_surrogates = 0;
3011
3012 for (; four_bytes < ucs4_end; ++four_bytes) {
3013 if (*four_bytes > 0xFFFF)
3014 ++num_surrogates;
3015 }
3016
3017 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
3018 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
3019 if (!_PyUnicode_WSTR(u)) {
3020 PyErr_NoMemory();
3021 return NULL;
3022 }
3023 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;
3024
3025 w = _PyUnicode_WSTR(u);
3026 wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
3027 four_bytes = PyUnicode_4BYTE_DATA(u);
3028 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3029 if (*four_bytes > 0xFFFF) {
3030 /* encode surrogate pair in this case */
3031 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
3032 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
3033 }
3034 else
3035 *w = *four_bytes;
3036
3037 if (w > wchar_end) {
3038 assert(0 && "Miscalculated string end");
3039 }
3040 }
3041 *w = 0;
3042#else
3043 /* sizeof(wchar_t) == 4 */
3044 Py_FatalError("Impossible unicode object state, wstr and str "
3045 "should share memory already.");
3046 return NULL;
3047#endif
3048 }
3049 else {
3050 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3051 (_PyUnicode_LENGTH(u) + 1));
3052 if (!_PyUnicode_WSTR(u)) {
3053 PyErr_NoMemory();
3054 return NULL;
3055 }
3056 if (!PyUnicode_IS_COMPACT_ASCII(u))
3057 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
3058 w = _PyUnicode_WSTR(u);
3059 wchar_end = w + _PyUnicode_LENGTH(u);
3060
3061 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
3062 one_byte = PyUnicode_1BYTE_DATA(u);
3063 for (; w < wchar_end; ++one_byte, ++w)
3064 *w = *one_byte;
3065 /* null-terminate the wstr */
3066 *w = 0;
3067 }
3068 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
3069#if SIZEOF_WCHAR_T == 4
3070 two_bytes = PyUnicode_2BYTE_DATA(u);
3071 for (; w < wchar_end; ++two_bytes, ++w)
3072 *w = *two_bytes;
3073 /* null-terminate the wstr */
3074 *w = 0;
3075#else
3076 /* sizeof(wchar_t) == 2 */
3077 PyObject_FREE(_PyUnicode_WSTR(u));
3078 _PyUnicode_WSTR(u) = NULL;
3079 Py_FatalError("Impossible unicode object state, wstr "
3080 "and str should share memory already.");
3081 return NULL;
3082#endif
3083 }
3084 else {
3085 assert(0 && "This should never happen.");
3086 }
3087 }
3088 }
3089 if (size != NULL)
3090 *size = PyUnicode_WSTR_LENGTH(u);
3091 return _PyUnicode_WSTR(u);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003092}
3093
Alexander Belopolsky40018472011-02-26 01:02:56 +00003094Py_UNICODE *
3095PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003096{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003097 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003098}
3099
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003100
Alexander Belopolsky40018472011-02-26 01:02:56 +00003101Py_ssize_t
3102PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003103{
3104 if (!PyUnicode_Check(unicode)) {
3105 PyErr_BadArgument();
3106 goto onError;
3107 }
3108 return PyUnicode_GET_SIZE(unicode);
3109
Benjamin Peterson29060642009-01-31 22:14:21 +00003110 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003111 return -1;
3112}
3113
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003114Py_ssize_t
3115PyUnicode_GetLength(PyObject *unicode)
3116{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003117 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003118 PyErr_BadArgument();
3119 return -1;
3120 }
3121
3122 return PyUnicode_GET_LENGTH(unicode);
3123}
3124
3125Py_UCS4
3126PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3127{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003128 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3129 PyErr_BadArgument();
3130 return (Py_UCS4)-1;
3131 }
3132 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3133 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003134 return (Py_UCS4)-1;
3135 }
3136 return PyUnicode_READ_CHAR(unicode, index);
3137}
3138
3139int
3140PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3141{
3142 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003143 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003144 return -1;
3145 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02003146 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3147 PyErr_SetString(PyExc_IndexError, "string index out of range");
3148 return -1;
3149 }
3150 if (_PyUnicode_Dirty(unicode))
3151 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003152 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3153 index, ch);
3154 return 0;
3155}
3156
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3162
Victor Stinner554f3f02010-06-16 23:33:54 +00003163/* create or adjust a UnicodeDecodeError */
3164static void
3165make_decode_exception(PyObject **exceptionObject,
3166 const char *encoding,
3167 const char *input, Py_ssize_t length,
3168 Py_ssize_t startpos, Py_ssize_t endpos,
3169 const char *reason)
3170{
3171 if (*exceptionObject == NULL) {
3172 *exceptionObject = PyUnicodeDecodeError_Create(
3173 encoding, input, length, startpos, endpos, reason);
3174 }
3175 else {
3176 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3177 goto onError;
3178 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3179 goto onError;
3180 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3181 goto onError;
3182 }
3183 return;
3184
3185onError:
3186 Py_DECREF(*exceptionObject);
3187 *exceptionObject = NULL;
3188}
3189
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003190/* error handling callback helper:
3191 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003192 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003193 and adjust various state variables.
3194 return 0 on success, -1 on error
3195*/
3196
Alexander Belopolsky40018472011-02-26 01:02:56 +00003197static int
3198unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003199 const char *encoding, const char *reason,
3200 const char **input, const char **inend, Py_ssize_t *startinpos,
3201 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
3202 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003203{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003204 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003205
3206 PyObject *restuple = NULL;
3207 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003208 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003209 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003210 Py_ssize_t requiredsize;
3211 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003212 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003213 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003214 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003215 int res = -1;
3216
3217 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003218 *errorHandler = PyCodec_LookupError(errors);
3219 if (*errorHandler == NULL)
3220 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003221 }
3222
Victor Stinner554f3f02010-06-16 23:33:54 +00003223 make_decode_exception(exceptionObject,
3224 encoding,
3225 *input, *inend - *input,
3226 *startinpos, *endinpos,
3227 reason);
3228 if (*exceptionObject == NULL)
3229 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003230
3231 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3232 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003233 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003234 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003235 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003236 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003237 }
3238 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003239 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003240
3241 /* Copy back the bytes variables, which might have been modified by the
3242 callback */
3243 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3244 if (!inputobj)
3245 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003246 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003247 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003248 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003249 *input = PyBytes_AS_STRING(inputobj);
3250 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003251 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003252 /* we can DECREF safely, as the exception has another reference,
3253 so the object won't go away. */
3254 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003255
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003256 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003257 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003258 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003259 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3260 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003261 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003262
3263 /* need more space? (at least enough for what we
3264 have+the replacement+the rest of the string (starting
3265 at the new input position), so we won't have to check space
3266 when there are no errors in the rest of the string) */
3267 repptr = PyUnicode_AS_UNICODE(repunicode);
3268 repsize = PyUnicode_GET_SIZE(repunicode);
3269 requiredsize = *outpos + repsize + insize-newpos;
3270 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003271 if (requiredsize<2*outsize)
3272 requiredsize = 2*outsize;
Victor Stinnerfe226c02011-10-03 03:52:20 +02003273 if (PyUnicode_Resize((PyObject**)output, requiredsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003274 goto onError;
3275 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003276 }
3277 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003278 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003279 Py_UNICODE_COPY(*outptr, repptr, repsize);
3280 *outptr += repsize;
3281 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003282
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003283 /* we made it! */
3284 res = 0;
3285
Benjamin Peterson29060642009-01-31 22:14:21 +00003286 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003287 Py_XDECREF(restuple);
3288 return res;
3289}
3290
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Base-64 helper macros.  Each of them may evaluate its argument more than
   once, so only pass expressions without side effects. */

/* Non-zero when c belongs to the base-64 alphabet (A-Z a-z 0-9 + /). */

#define IS_BASE64(c)                        \
    ((c) == '+' || (c) == '/' ||            \
     ((c) >= '0' && (c) <= '9') ||          \
     ((c) >= 'a' && (c) <= 'z') ||          \
     ((c) >= 'A' && (c) <= 'Z'))

/* Map a base-64 character to its 6-bit value.  The caller must already
   know that c is a base-64 character; anything else maps to 63. */

#define FROM_BASE64(c)                                  \
    ((c) == '+' ? 62 :                                  \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : 63)

/* Map the low 6 bits of n to the corresponding base-64 character. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: non-zero when a byte found in UTF-7 input stands for
 * itself.  Decoding is permissive: every ASCII byte does, except '+',
 * which opens a base-64 section. */

#define DECODE_DIRECT(c)                    \
    ((c) != '+' && (c) <= 127)
/* Classification of the 128 ASCII characters for the UTF-7 encoder, after
 * RFC2152.  Each entry is one of:
 *   0 : "Set D"       alphanumeric and '(),-./:?
 *   1 : "Set O"       !"#$%&*;<=>@[]^_`{|}
 *   2 : "whitespace"  ht nl cr sp
 *   3 : special (must be base64 encoded)
 *       everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static
char utf7_category[128] = {
    /* 0x00-0x0F: nul soh stx etx eot enq ack bel bs ht nl vt np cr so si */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
    /* 0x10-0x1F: dle dc1 dc2 dc3 dc4 nak syn etb can em sub esc fs gs rs us */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
    /* 0x20-0x2F: sp ! " # $ % & ' ( ) * + , - . / */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
    /* 0x30-0x3F: 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
    /* 0x40-0x4F: @ A B C D E F G H I J K L M N O */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0x50-0x5F: P Q R S T U V W X Y Z [ \ ] ^ _ */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
    /* 0x60-0x6F: ` a b c d e f g h i j k l m n o */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0x70-0x7F: p q r s t u v w x y z { | } ~ del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: non-zero when character c may be written to the output
 * unchanged.  Set D characters always qualify; Set O characters only when
 * directO is true and whitespace only when directWS is true, because
 * RFC2152 leaves both choices to the application.  NUL and non-ASCII
 * characters never qualify.  c is evaluated more than once. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) > 0 && (c) < 128 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directO) && (utf7_category[(c)] == 1)) ||       \
      ((directWS) && (utf7_category[(c)] == 2))))
Alexander Belopolsky40018472011-02-26 01:02:56 +00003372PyObject *
3373PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003374 Py_ssize_t size,
3375 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003376{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003377 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3378}
3379
Antoine Pitrou244651a2009-05-04 18:56:13 +00003380/* The decoder. The only state we preserve is our read position,
3381 * i.e. how many characters we have consumed. So if we end in the
3382 * middle of a shift sequence we have to back off the read position
3383 * and the output to the beginning of the sequence, otherwise we lose
3384 * all the shift state (seen bits, number of bits seen, high
3385 * surrogate). */
3386
Alexander Belopolsky40018472011-02-26 01:02:56 +00003387PyObject *
3388PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003389 Py_ssize_t size,
3390 const char *errors,
3391 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003392{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003393 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003394 Py_ssize_t startinpos;
3395 Py_ssize_t endinpos;
3396 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003397 const char *e;
3398 PyUnicodeObject *unicode;
3399 Py_UNICODE *p;
3400 const char *errmsg = "";
3401 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003402 Py_UNICODE *shiftOutStart;
3403 unsigned int base64bits = 0;
3404 unsigned long base64buffer = 0;
3405 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003406 PyObject *errorHandler = NULL;
3407 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003408
3409 unicode = _PyUnicode_New(size);
3410 if (!unicode)
3411 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003412 if (size == 0) {
3413 if (consumed)
3414 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003415 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003416 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003417
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003418 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003419 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003420 e = s + size;
3421
3422 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003423 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003424 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003425 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003426
Antoine Pitrou244651a2009-05-04 18:56:13 +00003427 if (inShift) { /* in a base-64 section */
3428 if (IS_BASE64(ch)) { /* consume a base-64 character */
3429 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3430 base64bits += 6;
3431 s++;
3432 if (base64bits >= 16) {
3433 /* we have enough bits for a UTF-16 value */
3434 Py_UNICODE outCh = (Py_UNICODE)
3435 (base64buffer >> (base64bits-16));
3436 base64bits -= 16;
3437 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3438 if (surrogate) {
3439 /* expecting a second surrogate */
3440 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3441#ifdef Py_UNICODE_WIDE
3442 *p++ = (((surrogate & 0x3FF)<<10)
3443 | (outCh & 0x3FF)) + 0x10000;
3444#else
3445 *p++ = surrogate;
3446 *p++ = outCh;
3447#endif
3448 surrogate = 0;
3449 }
3450 else {
3451 surrogate = 0;
3452 errmsg = "second surrogate missing";
3453 goto utf7Error;
3454 }
3455 }
3456 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3457 /* first surrogate */
3458 surrogate = outCh;
3459 }
3460 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3461 errmsg = "unexpected second surrogate";
3462 goto utf7Error;
3463 }
3464 else {
3465 *p++ = outCh;
3466 }
3467 }
3468 }
3469 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003470 inShift = 0;
3471 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003472 if (surrogate) {
3473 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003474 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003475 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003476 if (base64bits > 0) { /* left-over bits */
3477 if (base64bits >= 6) {
3478 /* We've seen at least one base-64 character */
3479 errmsg = "partial character in shift sequence";
3480 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003481 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003482 else {
3483 /* Some bits remain; they should be zero */
3484 if (base64buffer != 0) {
3485 errmsg = "non-zero padding bits in shift sequence";
3486 goto utf7Error;
3487 }
3488 }
3489 }
3490 if (ch != '-') {
3491 /* '-' is absorbed; other terminating
3492 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003493 *p++ = ch;
3494 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003495 }
3496 }
3497 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003498 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003499 s++; /* consume '+' */
3500 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003501 s++;
3502 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003503 }
3504 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003505 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003506 shiftOutStart = p;
3507 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003508 }
3509 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003510 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003511 *p++ = ch;
3512 s++;
3513 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003514 else {
3515 startinpos = s-starts;
3516 s++;
3517 errmsg = "unexpected special character";
3518 goto utf7Error;
3519 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003520 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003521utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003522 outpos = p-PyUnicode_AS_UNICODE(unicode);
3523 endinpos = s-starts;
3524 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003525 errors, &errorHandler,
3526 "utf7", errmsg,
3527 &starts, &e, &startinpos, &endinpos, &exc, &s,
3528 &unicode, &outpos, &p))
3529 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003530 }
3531
Antoine Pitrou244651a2009-05-04 18:56:13 +00003532 /* end of string */
3533
3534 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3535 /* if we're in an inconsistent state, that's an error */
3536 if (surrogate ||
3537 (base64bits >= 6) ||
3538 (base64bits > 0 && base64buffer != 0)) {
3539 outpos = p-PyUnicode_AS_UNICODE(unicode);
3540 endinpos = size;
3541 if (unicode_decode_call_errorhandler(
3542 errors, &errorHandler,
3543 "utf7", "unterminated shift sequence",
3544 &starts, &e, &startinpos, &endinpos, &exc, &s,
3545 &unicode, &outpos, &p))
3546 goto onError;
3547 if (s < e)
3548 goto restart;
3549 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003550 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003551
3552 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003553 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003554 if (inShift) {
3555 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003556 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003557 }
3558 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003559 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003560 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003561 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003562
Victor Stinnerfe226c02011-10-03 03:52:20 +02003563 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003564 goto onError;
3565
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003566 Py_XDECREF(errorHandler);
3567 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003568 if (PyUnicode_READY(unicode) == -1) {
3569 Py_DECREF(unicode);
3570 return NULL;
3571 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003572 return (PyObject *)unicode;
3573
Benjamin Peterson29060642009-01-31 22:14:21 +00003574 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003575 Py_XDECREF(errorHandler);
3576 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003577 Py_DECREF(unicode);
3578 return NULL;
3579}
3580
3581
Alexander Belopolsky40018472011-02-26 01:02:56 +00003582PyObject *
3583PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003584 Py_ssize_t size,
3585 int base64SetO,
3586 int base64WhiteSpace,
3587 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003588{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003589 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003590 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003591 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003592 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003593 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003594 unsigned int base64bits = 0;
3595 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003596 char * out;
3597 char * start;
3598
3599 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003600 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003601
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003602 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003603 return PyErr_NoMemory();
3604
Antoine Pitrou244651a2009-05-04 18:56:13 +00003605 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003606 if (v == NULL)
3607 return NULL;
3608
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003609 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003610 for (;i < size; ++i) {
3611 Py_UNICODE ch = s[i];
3612
Antoine Pitrou244651a2009-05-04 18:56:13 +00003613 if (inShift) {
3614 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3615 /* shifting out */
3616 if (base64bits) { /* output remaining bits */
3617 *out++ = TO_BASE64(base64buffer << (6-base64bits));
3618 base64buffer = 0;
3619 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003620 }
3621 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003622 /* Characters not in the BASE64 set implicitly unshift the sequence
3623 so no '-' is required, except if the character is itself a '-' */
3624 if (IS_BASE64(ch) || ch == '-') {
3625 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003626 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003627 *out++ = (char) ch;
3628 }
3629 else {
3630 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00003631 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003632 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003633 else { /* not in a shift sequence */
3634 if (ch == '+') {
3635 *out++ = '+';
3636 *out++ = '-';
3637 }
3638 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3639 *out++ = (char) ch;
3640 }
3641 else {
3642 *out++ = '+';
3643 inShift = 1;
3644 goto encode_char;
3645 }
3646 }
3647 continue;
3648encode_char:
3649#ifdef Py_UNICODE_WIDE
3650 if (ch >= 0x10000) {
3651 /* code first surrogate */
3652 base64bits += 16;
3653 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
3654 while (base64bits >= 6) {
3655 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3656 base64bits -= 6;
3657 }
3658 /* prepare second surrogate */
3659 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
3660 }
3661#endif
3662 base64bits += 16;
3663 base64buffer = (base64buffer << 16) | ch;
3664 while (base64bits >= 6) {
3665 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3666 base64bits -= 6;
3667 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00003668 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003669 if (base64bits)
3670 *out++= TO_BASE64(base64buffer << (6-base64bits) );
3671 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003672 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003673 if (_PyBytes_Resize(&v, out - start) < 0)
3674 return NULL;
3675 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003676}
3677
Antoine Pitrou244651a2009-05-04 18:56:13 +00003678#undef IS_BASE64
3679#undef FROM_BASE64
3680#undef TO_BASE64
3681#undef DECODE_DIRECT
3682#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003683
Guido van Rossumd57fd912000-03-10 22:53:23 +00003684/* --- UTF-8 Codec -------------------------------------------------------- */
3685
/* Length in bytes of the UTF-8 sequence introduced by each possible lead
   byte.  A zero marks a byte that cannot start a sequence.  See RFC 3629
   for details. */
static
char utf8_code_length[256] = {
    /* 00-7F: ASCII, one byte each */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 80-BF: illegal as a prefix byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* C0-C1: illegal; C2-CF: two bytes */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* D0-DF: two bytes */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* E0-EF: three bytes */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* F0-F4: four bytes; F5-FF: illegal */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
3707
Alexander Belopolsky40018472011-02-26 01:02:56 +00003708PyObject *
3709PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003710 Py_ssize_t size,
3711 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003712{
Walter Dörwald69652032004-09-07 20:24:22 +00003713 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3714}
3715
Antoine Pitrouab868312009-01-10 15:40:25 +00003716/* Mask to check or force alignment of a pointer to C 'long' boundaries */
3717#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
3718
3719/* Mask to quickly check whether a C 'long' contains a
3720 non-ASCII, UTF8-encoded char. */
3721#if (SIZEOF_LONG == 8)
3722# define ASCII_CHAR_MASK 0x8080808080808080L
3723#elif (SIZEOF_LONG == 4)
3724# define ASCII_CHAR_MASK 0x80808080L
3725#else
3726# error C 'long' size should be either 4 or 8!
3727#endif
3728
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003729/* Scans a UTF-8 string and returns the maximum character to be expected,
3730 the size of the decoded unicode string and if any major errors were
3731 encountered.
3732
3733 This function does check basic UTF-8 sanity, it does however NOT CHECK
3734 if the string contains surrogates, and if all continuation bytes are
3735 within the correct ranges, these checks are performed in
3736 PyUnicode_DecodeUTF8Stateful.
3737
3738 If it sets has_errors to 1, it means the value of unicode_size and max_char
3739 will be bogus and you should not rely on useful information in them.
3740 */
3741static Py_UCS4
3742utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
3743 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
3744 int *has_errors)
3745{
3746 Py_ssize_t n;
3747 Py_ssize_t char_count = 0;
3748 Py_UCS4 max_char = 127, new_max;
3749 Py_UCS4 upper_bound;
3750 const unsigned char *p = (const unsigned char *)s;
3751 const unsigned char *end = p + string_size;
3752 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
3753 int err = 0;
3754
3755 for (; p < end && !err; ++p, ++char_count) {
3756 /* Only check value if it's not a ASCII char... */
3757 if (*p < 0x80) {
3758 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
3759 an explanation. */
3760 if (!((size_t) p & LONG_PTR_MASK)) {
3761 /* Help register allocation */
3762 register const unsigned char *_p = p;
3763 while (_p < aligned_end) {
3764 unsigned long value = *(unsigned long *) _p;
3765 if (value & ASCII_CHAR_MASK)
3766 break;
3767 _p += SIZEOF_LONG;
3768 char_count += SIZEOF_LONG;
3769 }
3770 p = _p;
3771 if (p == end)
3772 break;
3773 }
3774 }
3775 if (*p >= 0x80) {
3776 n = utf8_code_length[*p];
3777 new_max = max_char;
3778 switch (n) {
3779 /* invalid start byte */
3780 case 0:
3781 err = 1;
3782 break;
3783 case 2:
3784 /* Code points between 0x00FF and 0x07FF inclusive.
3785 Approximate the upper bound of the code point,
3786 if this flips over 255 we can be sure it will be more
3787 than 255 and the string will need 2 bytes per code coint,
3788 if it stays under or equal to 255, we can be sure 1 byte
3789 is enough.
3790 ((*p & 0b00011111) << 6) | 0b00111111 */
3791 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
3792 if (max_char < upper_bound)
3793 new_max = upper_bound;
3794 /* Ensure we track at least that we left ASCII space. */
3795 if (new_max < 128)
3796 new_max = 128;
3797 break;
3798 case 3:
3799 /* Between 0x0FFF and 0xFFFF inclusive, so values are
3800 always > 255 and <= 65535 and will always need 2 bytes. */
3801 if (max_char < 65535)
3802 new_max = 65535;
3803 break;
3804 case 4:
3805 /* Code point will be above 0xFFFF for sure in this case. */
3806 new_max = 65537;
3807 break;
3808 /* Internal error, this should be caught by the first if */
3809 case 1:
3810 default:
3811 assert(0 && "Impossible case in utf8_max_char_and_size");
3812 err = 1;
3813 }
3814 /* Instead of number of overall bytes for this code point,
3815 n containts the number of following bytes: */
3816 --n;
3817 /* Check if the follow up chars are all valid continuation bytes */
3818 if (n >= 1) {
3819 const unsigned char *cont;
3820 if ((p + n) >= end) {
3821 if (consumed == 0)
3822 /* incomplete data, non-incremental decoding */
3823 err = 1;
3824 break;
3825 }
3826 for (cont = p + 1; cont < (p + n); ++cont) {
3827 if ((*cont & 0xc0) != 0x80) {
3828 err = 1;
3829 break;
3830 }
3831 }
3832 p += n;
3833 }
3834 else
3835 err = 1;
3836 max_char = new_max;
3837 }
3838 }
3839
3840 if (unicode_size)
3841 *unicode_size = char_count;
3842 if (has_errors)
3843 *has_errors = err;
3844 return max_char;
3845}
3846
/* Similar to PyUnicode_WRITE but can also write into wstr field
   of the legacy unicode representation.

   `kind` selects the element type of `buf` (Py_UNICODE, 1-byte, 2-byte or,
   for anything else, Py_UCS4) and `value` is cast to that type before it is
   stored at `buf[index]`.  `kind` is evaluated once, and only one branch
   runs, so `index` and `value` are each evaluated exactly once; passing
   `i++` as the index is safe. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value)                 \
    do {                                                                \
        const int k_ = (kind);                                          \
        if (k_ == PyUnicode_WCHAR_KIND)                                 \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);       \
        else if (k_ == PyUnicode_1BYTE_KIND)                            \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
        else if (k_ == PyUnicode_2BYTE_KIND)                            \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value);             \
        else                                                            \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value);             \
    } while (0)
3861
Alexander Belopolsky40018472011-02-26 01:02:56 +00003862PyObject *
3863PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003864 Py_ssize_t size,
3865 const char *errors,
3866 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003867{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003868 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003869 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00003870 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003871 Py_ssize_t startinpos;
3872 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00003873 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003874 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003875 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003876 PyObject *errorHandler = NULL;
3877 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003878 Py_UCS4 maxchar = 0;
3879 Py_ssize_t unicode_size;
3880 Py_ssize_t i;
3881 int kind;
3882 void *data;
3883 int has_errors;
3884 Py_UNICODE *error_outptr;
3885#if SIZEOF_WCHAR_T == 2
3886 Py_ssize_t wchar_offset = 0;
3887#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003888
Walter Dörwald69652032004-09-07 20:24:22 +00003889 if (size == 0) {
3890 if (consumed)
3891 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003892 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00003893 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003894 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
3895 consumed, &has_errors);
3896 if (has_errors) {
3897 unicode = _PyUnicode_New(size);
3898 if (!unicode)
3899 return NULL;
3900 kind = PyUnicode_WCHAR_KIND;
3901 data = PyUnicode_AS_UNICODE(unicode);
3902 assert(data != NULL);
3903 }
3904 else {
3905 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
3906 if (!unicode)
3907 return NULL;
3908 /* When the string is ASCII only, just use memcpy and return.
3909 unicode_size may be != size if there is an incomplete UTF-8
3910 sequence at the end of the ASCII block. */
3911 if (maxchar < 128 && size == unicode_size) {
3912 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
3913 return (PyObject *)unicode;
3914 }
3915 kind = PyUnicode_KIND(unicode);
3916 data = PyUnicode_DATA(unicode);
3917 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003918 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003919 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003920 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00003921 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003922
3923 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003924 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003925
3926 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00003927 /* Fast path for runs of ASCII characters. Given that common UTF-8
3928 input will consist of an overwhelming majority of ASCII
3929 characters, we try to optimize for this case by checking
3930 as many characters as a C 'long' can contain.
3931 First, check if we can do an aligned read, as most CPUs have
3932 a penalty for unaligned reads.
3933 */
3934 if (!((size_t) s & LONG_PTR_MASK)) {
3935 /* Help register allocation */
3936 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003937 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003938 while (_s < aligned_end) {
3939 /* Read a whole long at a time (either 4 or 8 bytes),
3940 and do a fast unrolled copy if it only contains ASCII
3941 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003942 unsigned long value = *(unsigned long *) _s;
3943 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00003944 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003945 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
3946 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
3947 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
3948 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00003949#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003950 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
3951 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
3952 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
3953 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00003954#endif
3955 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003956 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00003957 }
3958 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003959 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003960 if (s == e)
3961 break;
3962 ch = (unsigned char)*s;
3963 }
3964 }
3965
3966 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003967 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003968 s++;
3969 continue;
3970 }
3971
3972 n = utf8_code_length[ch];
3973
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003974 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003975 if (consumed)
3976 break;
3977 else {
3978 errmsg = "unexpected end of data";
3979 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003980 endinpos = startinpos+1;
3981 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
3982 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00003983 goto utf8Error;
3984 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003985 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003986
3987 switch (n) {
3988
3989 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00003990 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003991 startinpos = s-starts;
3992 endinpos = startinpos+1;
3993 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003994
3995 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003996 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00003997 startinpos = s-starts;
3998 endinpos = startinpos+1;
3999 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004000
4001 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004002 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004003 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004004 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004005 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004006 goto utf8Error;
4007 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004008 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004009 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004010 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004011 break;
4012
4013 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004014 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4015 will result in surrogates in range d800-dfff. Surrogates are
4016 not valid UTF-8 so they are rejected.
4017 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4018 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004019 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004020 (s[2] & 0xc0) != 0x80 ||
4021 ((unsigned char)s[0] == 0xE0 &&
4022 (unsigned char)s[1] < 0xA0) ||
4023 ((unsigned char)s[0] == 0xED &&
4024 (unsigned char)s[1] > 0x9F)) {
4025 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004026 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004027 endinpos = startinpos + 1;
4028
4029 /* if s[1] first two bits are 1 and 0, then the invalid
4030 continuation byte is s[2], so increment endinpos by 1,
4031 if not, s[1] is invalid and endinpos doesn't need to
4032 be incremented. */
4033 if ((s[1] & 0xC0) == 0x80)
4034 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004035 goto utf8Error;
4036 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004037 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004038 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004039 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004040 break;
4041
4042 case 4:
4043 if ((s[1] & 0xc0) != 0x80 ||
4044 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004045 (s[3] & 0xc0) != 0x80 ||
4046 ((unsigned char)s[0] == 0xF0 &&
4047 (unsigned char)s[1] < 0x90) ||
4048 ((unsigned char)s[0] == 0xF4 &&
4049 (unsigned char)s[1] > 0x8F)) {
4050 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004051 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004052 endinpos = startinpos + 1;
4053 if ((s[1] & 0xC0) == 0x80) {
4054 endinpos++;
4055 if ((s[2] & 0xC0) == 0x80)
4056 endinpos++;
4057 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004058 goto utf8Error;
4059 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004060 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004061 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4062 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4063
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004064 /* If the string is flexible or we have native UCS-4, write
4065 directly.. */
4066 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
4067 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00004068
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004069 else {
4070 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00004071
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004072 /* translate from 10000..10FFFF to 0..FFFF */
4073 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00004074
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004075 /* high surrogate = top 10 bits added to D800 */
4076 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4077 (Py_UNICODE)(0xD800 + (ch >> 10)));
4078
4079 /* low surrogate = bottom 10 bits added to DC00 */
4080 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4081 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
4082 }
4083#if SIZEOF_WCHAR_T == 2
4084 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004085#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004086 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004087 }
4088 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004089 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004090
Benjamin Peterson29060642009-01-31 22:14:21 +00004091 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004092 /* If this is not yet a resizable string, make it one.. */
4093 if (kind != PyUnicode_WCHAR_KIND) {
4094 const Py_UNICODE *u;
4095 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
4096 if (!new_unicode)
4097 goto onError;
4098 u = PyUnicode_AsUnicode((PyObject *)unicode);
4099 if (!u)
4100 goto onError;
4101#if SIZEOF_WCHAR_T == 2
4102 i += wchar_offset;
4103#endif
4104 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
4105 Py_DECREF(unicode);
4106 unicode = new_unicode;
4107 kind = 0;
4108 data = PyUnicode_AS_UNICODE(new_unicode);
4109 assert(data != NULL);
4110 }
4111 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00004112 if (unicode_decode_call_errorhandler(
4113 errors, &errorHandler,
4114 "utf8", errmsg,
4115 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004116 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00004117 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004118 /* Update data because unicode_decode_call_errorhandler might have
4119 re-created or resized the unicode object. */
4120 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00004121 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004122 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004123 /* Ensure the unicode_size calculation above was correct: */
4124 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
4125
Walter Dörwald69652032004-09-07 20:24:22 +00004126 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004127 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004128
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004129 /* Adjust length and ready string when it contained errors and
4130 is of the old resizable kind. */
4131 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02004132 if (PyUnicode_Resize((PyObject**)&unicode, i) < 0 ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004133 PyUnicode_READY(unicode) == -1)
4134 goto onError;
4135 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004136
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004137 Py_XDECREF(errorHandler);
4138 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004139 if (PyUnicode_READY(unicode) == -1) {
4140 Py_DECREF(unicode);
4141 return NULL;
4142 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004143 return (PyObject *)unicode;
4144
Benjamin Peterson29060642009-01-31 22:14:21 +00004145 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004146 Py_XDECREF(errorHandler);
4147 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004148 Py_DECREF(unicode);
4149 return NULL;
4150}
4151
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004152#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00004153
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Decodes the `size` bytes at `s` into a newly PyMem_Malloc'ed,
   NUL-terminated wchar_t array.  Any byte that does not start a well-formed
   UTF-8 sequence is stored as 0xDC00 + byte and decoding resumes at the
   next byte.  Returns NULL if the buffer size would overflow (after calling
   PyErr_NoMemory()) or if the allocation fails. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    int n;
    const char *e;
    wchar_t *unicode, *p;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    p = unicode;
    e = s + size;
    while (s < e) {
        Py_UCS4 ch = (unsigned char)*s;

        if (ch < 0x80) {
            *p++ = (wchar_t)ch;
            s++;
            continue;
        }

        n = utf8_code_length[ch];
        if (s + n > e) {
            /* Truncated sequence at the end of the input. */
            goto surrogateescape;
        }

        switch (n) {
        case 0:
        case 1:
            goto surrogateescape;

        case 2:
            if ((s[1] & 0xc0) != 0x80)
                goto surrogateescape;
            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
            assert ((ch > 0x007F) && (ch <= 0x07FF));
            *p++ = (wchar_t)ch;
            break;

        case 3:
            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
               will result in surrogates in range d800-dfff. Surrogates are
               not valid UTF-8 so they are rejected.
               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xE0 &&
                 (unsigned char)s[1] < 0xA0) ||
                ((unsigned char)s[0] == 0xED &&
                 (unsigned char)s[1] > 0x9F)) {

                goto surrogateescape;
            }
            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
            *p++ = (wchar_t)ch;
            break;

        case 4:
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                (s[3] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xF0 &&
                 (unsigned char)s[1] < 0x90) ||
                ((unsigned char)s[0] == 0xF4 &&
                 (unsigned char)s[1] > 0x8F)) {
                goto surrogateescape;
            }
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));

#if SIZEOF_WCHAR_T == 4
            *p++ = (wchar_t)ch;
#else
            /* compute and append the two surrogates: */

            /* translate from 10000..10FFFF to 0..FFFF */
            ch -= 0x10000;

            /* high surrogate = top 10 bits added to D800 */
            *p++ = (wchar_t)(0xD800 + (ch >> 10));

            /* low surrogate = bottom 10 bits added to DC00 */
            *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
#endif
            break;
        }
        s += n;
        continue;

      surrogateescape:
        /* ch still holds the offending lead byte here: escape that single
           byte and retry from the byte after it. */
        *p++ = 0xDC00 + ch;
        s++;
    }
    *p = L'\0';
    return unicode;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004268
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004269/* Primary internal function which creates utf8 encoded bytes objects.
4270
4271 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004272 and allocate exactly as much space needed at the end. Else allocate the
4273 maximum possible needed (4 result bytes per Unicode character), and return
4274 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004275*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004276PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004277_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004278{
Tim Peters602f7402002-04-27 18:03:26 +00004279#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004280
Guido van Rossum98297ee2007-11-06 21:34:58 +00004281 Py_ssize_t i; /* index into s of next input byte */
4282 PyObject *result; /* result string object */
4283 char *p; /* next free byte in output buffer */
4284 Py_ssize_t nallocated; /* number of result bytes allocated */
4285 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004286 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004287 PyObject *errorHandler = NULL;
4288 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004289 int kind;
4290 void *data;
4291 Py_ssize_t size;
4292 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
4293#if SIZEOF_WCHAR_T == 2
4294 Py_ssize_t wchar_offset = 0;
4295#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004296
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004297 if (!PyUnicode_Check(unicode)) {
4298 PyErr_BadArgument();
4299 return NULL;
4300 }
4301
4302 if (PyUnicode_READY(unicode) == -1)
4303 return NULL;
4304
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004305 if (PyUnicode_UTF8(unicode))
4306 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4307 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004308
4309 kind = PyUnicode_KIND(unicode);
4310 data = PyUnicode_DATA(unicode);
4311 size = PyUnicode_GET_LENGTH(unicode);
4312
Tim Peters602f7402002-04-27 18:03:26 +00004313 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004314
Tim Peters602f7402002-04-27 18:03:26 +00004315 if (size <= MAX_SHORT_UNICHARS) {
4316 /* Write into the stack buffer; nallocated can't overflow.
4317 * At the end, we'll allocate exactly as much heap space as it
4318 * turns out we need.
4319 */
4320 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004321 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004322 p = stackbuf;
4323 }
4324 else {
4325 /* Overallocate on the heap, and give the excess back at the end. */
4326 nallocated = size * 4;
4327 if (nallocated / 4 != size) /* overflow! */
4328 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004329 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004330 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004331 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004332 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004333 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004334
Tim Peters602f7402002-04-27 18:03:26 +00004335 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004336 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004337
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004338 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004339 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004340 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004341
Guido van Rossumd57fd912000-03-10 22:53:23 +00004342 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004343 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004344 *p++ = (char)(0xc0 | (ch >> 6));
4345 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004346 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004347 Py_ssize_t newpos;
4348 PyObject *rep;
4349 Py_ssize_t repsize, k, startpos;
4350 startpos = i-1;
4351#if SIZEOF_WCHAR_T == 2
4352 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00004353#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004354 rep = unicode_encode_call_errorhandler(
4355 errors, &errorHandler, "utf-8", "surrogates not allowed",
4356 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4357 &exc, startpos, startpos+1, &newpos);
4358 if (!rep)
4359 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004360
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004361 if (PyBytes_Check(rep))
4362 repsize = PyBytes_GET_SIZE(rep);
4363 else
4364 repsize = PyUnicode_GET_SIZE(rep);
4365
4366 if (repsize > 4) {
4367 Py_ssize_t offset;
4368
4369 if (result == NULL)
4370 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004371 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004372 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004373
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004374 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4375 /* integer overflow */
4376 PyErr_NoMemory();
4377 goto error;
4378 }
4379 nallocated += repsize - 4;
4380 if (result != NULL) {
4381 if (_PyBytes_Resize(&result, nallocated) < 0)
4382 goto error;
4383 } else {
4384 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004385 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004386 goto error;
4387 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4388 }
4389 p = PyBytes_AS_STRING(result) + offset;
4390 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004391
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004392 if (PyBytes_Check(rep)) {
4393 char *prep = PyBytes_AS_STRING(rep);
4394 for(k = repsize; k > 0; k--)
4395 *p++ = *prep++;
4396 } else /* rep is unicode */ {
4397 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4398 Py_UNICODE c;
4399
4400 for(k=0; k<repsize; k++) {
4401 c = prep[k];
4402 if (0x80 <= c) {
4403 raise_encode_exception(&exc, "utf-8",
4404 PyUnicode_AS_UNICODE(unicode),
4405 size, i-1, i,
4406 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004407 goto error;
4408 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004409 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004410 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004411 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004412 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004413 } else if (ch < 0x10000) {
4414 *p++ = (char)(0xe0 | (ch >> 12));
4415 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4416 *p++ = (char)(0x80 | (ch & 0x3f));
4417 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004418 /* Encode UCS4 Unicode ordinals */
4419 *p++ = (char)(0xf0 | (ch >> 18));
4420 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4421 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4422 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004423#if SIZEOF_WCHAR_T == 2
4424 wchar_offset++;
4425#endif
Tim Peters602f7402002-04-27 18:03:26 +00004426 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004427 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004428
Guido van Rossum98297ee2007-11-06 21:34:58 +00004429 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004430 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004431 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004432 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004433 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004434 }
4435 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004436 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004437 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004438 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004439 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004440 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004441
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004442 Py_XDECREF(errorHandler);
4443 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004444 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004445 error:
4446 Py_XDECREF(errorHandler);
4447 Py_XDECREF(exc);
4448 Py_XDECREF(result);
4449 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004450
Tim Peters602f7402002-04-27 18:03:26 +00004451#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004452}
4453
Alexander Belopolsky40018472011-02-26 01:02:56 +00004454PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004455PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4456 Py_ssize_t size,
4457 const char *errors)
4458{
4459 PyObject *v, *unicode;
4460
4461 unicode = PyUnicode_FromUnicode(s, size);
4462 if (unicode == NULL)
4463 return NULL;
4464 v = _PyUnicode_AsUTF8String(unicode, errors);
4465 Py_DECREF(unicode);
4466 return v;
4467}
4468
4469PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004470PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004471{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004472 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004473}
4474
Walter Dörwald41980ca2007-08-16 21:55:45 +00004475/* --- UTF-32 Codec ------------------------------------------------------- */
4476
4477PyObject *
4478PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004479 Py_ssize_t size,
4480 const char *errors,
4481 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004482{
4483 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4484}
4485
4486PyObject *
4487PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004488 Py_ssize_t size,
4489 const char *errors,
4490 int *byteorder,
4491 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004492{
4493 const char *starts = s;
4494 Py_ssize_t startinpos;
4495 Py_ssize_t endinpos;
4496 Py_ssize_t outpos;
4497 PyUnicodeObject *unicode;
4498 Py_UNICODE *p;
4499#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004500 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004501 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004502#else
4503 const int pairs = 0;
4504#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004505 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004506 int bo = 0; /* assume native ordering by default */
4507 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004508 /* Offsets from q for retrieving bytes in the right order. */
4509#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4510 int iorder[] = {0, 1, 2, 3};
4511#else
4512 int iorder[] = {3, 2, 1, 0};
4513#endif
4514 PyObject *errorHandler = NULL;
4515 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004516
Walter Dörwald41980ca2007-08-16 21:55:45 +00004517 q = (unsigned char *)s;
4518 e = q + size;
4519
4520 if (byteorder)
4521 bo = *byteorder;
4522
4523 /* Check for BOM marks (U+FEFF) in the input and adjust current
4524 byte order setting accordingly. In native mode, the leading BOM
4525 mark is skipped, in all other modes, it is copied to the output
4526 stream as-is (giving a ZWNBSP character). */
4527 if (bo == 0) {
4528 if (size >= 4) {
4529 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004530 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004531#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004532 if (bom == 0x0000FEFF) {
4533 q += 4;
4534 bo = -1;
4535 }
4536 else if (bom == 0xFFFE0000) {
4537 q += 4;
4538 bo = 1;
4539 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004540#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004541 if (bom == 0x0000FEFF) {
4542 q += 4;
4543 bo = 1;
4544 }
4545 else if (bom == 0xFFFE0000) {
4546 q += 4;
4547 bo = -1;
4548 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004549#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004550 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004551 }
4552
4553 if (bo == -1) {
4554 /* force LE */
4555 iorder[0] = 0;
4556 iorder[1] = 1;
4557 iorder[2] = 2;
4558 iorder[3] = 3;
4559 }
4560 else if (bo == 1) {
4561 /* force BE */
4562 iorder[0] = 3;
4563 iorder[1] = 2;
4564 iorder[2] = 1;
4565 iorder[3] = 0;
4566 }
4567
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004568 /* On narrow builds we split characters outside the BMP into two
4569 codepoints => count how much extra space we need. */
4570#ifndef Py_UNICODE_WIDE
4571 for (qq = q; qq < e; qq += 4)
4572 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4573 pairs++;
4574#endif
4575
4576 /* This might be one to much, because of a BOM */
4577 unicode = _PyUnicode_New((size+3)/4+pairs);
4578 if (!unicode)
4579 return NULL;
4580 if (size == 0)
4581 return (PyObject *)unicode;
4582
4583 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004584 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004585
Walter Dörwald41980ca2007-08-16 21:55:45 +00004586 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004587 Py_UCS4 ch;
4588 /* remaining bytes at the end? (size should be divisible by 4) */
4589 if (e-q<4) {
4590 if (consumed)
4591 break;
4592 errmsg = "truncated data";
4593 startinpos = ((const char *)q)-starts;
4594 endinpos = ((const char *)e)-starts;
4595 goto utf32Error;
4596 /* The remaining input chars are ignored if the callback
4597 chooses to skip the input */
4598 }
4599 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4600 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004601
Benjamin Peterson29060642009-01-31 22:14:21 +00004602 if (ch >= 0x110000)
4603 {
4604 errmsg = "codepoint not in range(0x110000)";
4605 startinpos = ((const char *)q)-starts;
4606 endinpos = startinpos+4;
4607 goto utf32Error;
4608 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004609#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004610 if (ch >= 0x10000)
4611 {
4612 *p++ = 0xD800 | ((ch-0x10000) >> 10);
4613 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4614 }
4615 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00004616#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004617 *p++ = ch;
4618 q += 4;
4619 continue;
4620 utf32Error:
4621 outpos = p-PyUnicode_AS_UNICODE(unicode);
4622 if (unicode_decode_call_errorhandler(
4623 errors, &errorHandler,
4624 "utf32", errmsg,
4625 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4626 &unicode, &outpos, &p))
4627 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004628 }
4629
4630 if (byteorder)
4631 *byteorder = bo;
4632
4633 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004634 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004635
4636 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02004637 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004638 goto onError;
4639
4640 Py_XDECREF(errorHandler);
4641 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004642 if (PyUnicode_READY(unicode) == -1) {
4643 Py_DECREF(unicode);
4644 return NULL;
4645 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004646 return (PyObject *)unicode;
4647
Benjamin Peterson29060642009-01-31 22:14:21 +00004648 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00004649 Py_DECREF(unicode);
4650 Py_XDECREF(errorHandler);
4651 Py_XDECREF(exc);
4652 return NULL;
4653}
4654
4655PyObject *
4656PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004657 Py_ssize_t size,
4658 const char *errors,
4659 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004660{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004661 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004662 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004663 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004664#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004665 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004666#else
4667 const int pairs = 0;
4668#endif
4669 /* Offsets from p for storing byte pairs in the right order. */
4670#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4671 int iorder[] = {0, 1, 2, 3};
4672#else
4673 int iorder[] = {3, 2, 1, 0};
4674#endif
4675
Benjamin Peterson29060642009-01-31 22:14:21 +00004676#define STORECHAR(CH) \
4677 do { \
4678 p[iorder[3]] = ((CH) >> 24) & 0xff; \
4679 p[iorder[2]] = ((CH) >> 16) & 0xff; \
4680 p[iorder[1]] = ((CH) >> 8) & 0xff; \
4681 p[iorder[0]] = (CH) & 0xff; \
4682 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00004683 } while(0)
4684
4685 /* In narrow builds we can output surrogate pairs as one codepoint,
4686 so we need less space. */
4687#ifndef Py_UNICODE_WIDE
4688 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004689 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
4690 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
4691 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004692#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004693 nsize = (size - pairs + (byteorder == 0));
4694 bytesize = nsize * 4;
4695 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004696 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004697 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004698 if (v == NULL)
4699 return NULL;
4700
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004701 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004702 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004703 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004704 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004705 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004706
4707 if (byteorder == -1) {
4708 /* force LE */
4709 iorder[0] = 0;
4710 iorder[1] = 1;
4711 iorder[2] = 2;
4712 iorder[3] = 3;
4713 }
4714 else if (byteorder == 1) {
4715 /* force BE */
4716 iorder[0] = 3;
4717 iorder[1] = 2;
4718 iorder[2] = 1;
4719 iorder[3] = 0;
4720 }
4721
4722 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004723 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004724#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004725 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
4726 Py_UCS4 ch2 = *s;
4727 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
4728 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
4729 s++;
4730 size--;
4731 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004732 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004733#endif
4734 STORECHAR(ch);
4735 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004736
4737 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004738 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004739#undef STORECHAR
4740}
4741
Alexander Belopolsky40018472011-02-26 01:02:56 +00004742PyObject *
4743PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004744{
4745 if (!PyUnicode_Check(unicode)) {
4746 PyErr_BadArgument();
4747 return NULL;
4748 }
4749 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004750 PyUnicode_GET_SIZE(unicode),
4751 NULL,
4752 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004753}
4754
Guido van Rossumd57fd912000-03-10 22:53:23 +00004755/* --- UTF-16 Codec ------------------------------------------------------- */
4756
Tim Peters772747b2001-08-09 22:21:55 +00004757PyObject *
4758PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004759 Py_ssize_t size,
4760 const char *errors,
4761 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004762{
Walter Dörwald69652032004-09-07 20:24:22 +00004763 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
4764}
4765
Antoine Pitrouab868312009-01-10 15:40:25 +00004766/* Two masks for fast checking of whether a C 'long' may contain
4767 UTF16-encoded surrogate characters. This is an efficient heuristic,
4768 assuming that non-surrogate characters with a code point >= 0x8000 are
4769 rare in most input.
4770 FAST_CHAR_MASK is used when the input is in native byte ordering,
4771 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00004772*/
Antoine Pitrouab868312009-01-10 15:40:25 +00004773#if (SIZEOF_LONG == 8)
4774# define FAST_CHAR_MASK 0x8000800080008000L
4775# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
4776#elif (SIZEOF_LONG == 4)
4777# define FAST_CHAR_MASK 0x80008000L
4778# define SWAPPED_FAST_CHAR_MASK 0x00800080L
4779#else
4780# error C 'long' size should be either 4 or 8!
4781#endif
4782
Walter Dörwald69652032004-09-07 20:24:22 +00004783PyObject *
4784PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004785 Py_ssize_t size,
4786 const char *errors,
4787 int *byteorder,
4788 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004789{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004790 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004791 Py_ssize_t startinpos;
4792 Py_ssize_t endinpos;
4793 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004794 PyUnicodeObject *unicode;
4795 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004796 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00004797 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00004798 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004799 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00004800 /* Offsets from q for retrieving byte pairs in the right order. */
4801#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4802 int ihi = 1, ilo = 0;
4803#else
4804 int ihi = 0, ilo = 1;
4805#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004806 PyObject *errorHandler = NULL;
4807 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004808
4809 /* Note: size will always be longer than the resulting Unicode
4810 character count */
4811 unicode = _PyUnicode_New(size);
4812 if (!unicode)
4813 return NULL;
4814 if (size == 0)
4815 return (PyObject *)unicode;
4816
4817 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004818 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00004819 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00004820 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004821
4822 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00004823 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004824
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004825 /* Check for BOM marks (U+FEFF) in the input and adjust current
4826 byte order setting accordingly. In native mode, the leading BOM
4827 mark is skipped, in all other modes, it is copied to the output
4828 stream as-is (giving a ZWNBSP character). */
4829 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00004830 if (size >= 2) {
4831 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004832#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004833 if (bom == 0xFEFF) {
4834 q += 2;
4835 bo = -1;
4836 }
4837 else if (bom == 0xFFFE) {
4838 q += 2;
4839 bo = 1;
4840 }
Tim Petersced69f82003-09-16 20:30:58 +00004841#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004842 if (bom == 0xFEFF) {
4843 q += 2;
4844 bo = 1;
4845 }
4846 else if (bom == 0xFFFE) {
4847 q += 2;
4848 bo = -1;
4849 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004850#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004851 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004852 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004853
Tim Peters772747b2001-08-09 22:21:55 +00004854 if (bo == -1) {
4855 /* force LE */
4856 ihi = 1;
4857 ilo = 0;
4858 }
4859 else if (bo == 1) {
4860 /* force BE */
4861 ihi = 0;
4862 ilo = 1;
4863 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004864#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4865 native_ordering = ilo < ihi;
4866#else
4867 native_ordering = ilo > ihi;
4868#endif
Tim Peters772747b2001-08-09 22:21:55 +00004869
Antoine Pitrouab868312009-01-10 15:40:25 +00004870 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00004871 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004872 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00004873 /* First check for possible aligned read of a C 'long'. Unaligned
4874 reads are more expensive, better to defer to another iteration. */
4875 if (!((size_t) q & LONG_PTR_MASK)) {
4876 /* Fast path for runs of non-surrogate chars. */
4877 register const unsigned char *_q = q;
4878 Py_UNICODE *_p = p;
4879 if (native_ordering) {
4880 /* Native ordering is simple: as long as the input cannot
4881 possibly contain a surrogate char, do an unrolled copy
4882 of several 16-bit code points to the target object.
4883 The non-surrogate check is done on several input bytes
4884 at a time (as many as a C 'long' can contain). */
4885 while (_q < aligned_end) {
4886 unsigned long data = * (unsigned long *) _q;
4887 if (data & FAST_CHAR_MASK)
4888 break;
4889 _p[0] = ((unsigned short *) _q)[0];
4890 _p[1] = ((unsigned short *) _q)[1];
4891#if (SIZEOF_LONG == 8)
4892 _p[2] = ((unsigned short *) _q)[2];
4893 _p[3] = ((unsigned short *) _q)[3];
4894#endif
4895 _q += SIZEOF_LONG;
4896 _p += SIZEOF_LONG / 2;
4897 }
4898 }
4899 else {
4900 /* Byteswapped ordering is similar, but we must decompose
4901 the copy bytewise, and take care of zero'ing out the
4902 upper bytes if the target object is in 32-bit units
4903 (that is, in UCS-4 builds). */
4904 while (_q < aligned_end) {
4905 unsigned long data = * (unsigned long *) _q;
4906 if (data & SWAPPED_FAST_CHAR_MASK)
4907 break;
4908 /* Zero upper bytes in UCS-4 builds */
4909#if (Py_UNICODE_SIZE > 2)
4910 _p[0] = 0;
4911 _p[1] = 0;
4912#if (SIZEOF_LONG == 8)
4913 _p[2] = 0;
4914 _p[3] = 0;
4915#endif
4916#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004917 /* Issue #4916; UCS-4 builds on big endian machines must
4918 fill the two last bytes of each 4-byte unit. */
4919#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
4920# define OFF 2
4921#else
4922# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00004923#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004924 ((unsigned char *) _p)[OFF + 1] = _q[0];
4925 ((unsigned char *) _p)[OFF + 0] = _q[1];
4926 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
4927 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
4928#if (SIZEOF_LONG == 8)
4929 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
4930 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
4931 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
4932 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
4933#endif
4934#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00004935 _q += SIZEOF_LONG;
4936 _p += SIZEOF_LONG / 2;
4937 }
4938 }
4939 p = _p;
4940 q = _q;
4941 if (q >= e)
4942 break;
4943 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004944 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004945
Benjamin Peterson14339b62009-01-31 16:36:08 +00004946 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00004947
4948 if (ch < 0xD800 || ch > 0xDFFF) {
4949 *p++ = ch;
4950 continue;
4951 }
4952
4953 /* UTF-16 code pair: */
4954 if (q > e) {
4955 errmsg = "unexpected end of data";
4956 startinpos = (((const char *)q) - 2) - starts;
4957 endinpos = ((const char *)e) + 1 - starts;
4958 goto utf16Error;
4959 }
4960 if (0xD800 <= ch && ch <= 0xDBFF) {
4961 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
4962 q += 2;
4963 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00004964#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004965 *p++ = ch;
4966 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004967#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004968 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004969#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004970 continue;
4971 }
4972 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004973 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00004974 startinpos = (((const char *)q)-4)-starts;
4975 endinpos = startinpos+2;
4976 goto utf16Error;
4977 }
4978
Benjamin Peterson14339b62009-01-31 16:36:08 +00004979 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004980 errmsg = "illegal encoding";
4981 startinpos = (((const char *)q)-2)-starts;
4982 endinpos = startinpos+2;
4983 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004984
Benjamin Peterson29060642009-01-31 22:14:21 +00004985 utf16Error:
4986 outpos = p - PyUnicode_AS_UNICODE(unicode);
4987 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00004988 errors,
4989 &errorHandler,
4990 "utf16", errmsg,
4991 &starts,
4992 (const char **)&e,
4993 &startinpos,
4994 &endinpos,
4995 &exc,
4996 (const char **)&q,
4997 &unicode,
4998 &outpos,
4999 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00005000 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005001 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005002 /* remaining byte at the end? (size should be even) */
5003 if (e == q) {
5004 if (!consumed) {
5005 errmsg = "truncated data";
5006 startinpos = ((const char *)q) - starts;
5007 endinpos = ((const char *)e) + 1 - starts;
5008 outpos = p - PyUnicode_AS_UNICODE(unicode);
5009 if (unicode_decode_call_errorhandler(
5010 errors,
5011 &errorHandler,
5012 "utf16", errmsg,
5013 &starts,
5014 (const char **)&e,
5015 &startinpos,
5016 &endinpos,
5017 &exc,
5018 (const char **)&q,
5019 &unicode,
5020 &outpos,
5021 &p))
5022 goto onError;
5023 /* The remaining input chars are ignored if the callback
5024 chooses to skip the input */
5025 }
5026 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005027
5028 if (byteorder)
5029 *byteorder = bo;
5030
Walter Dörwald69652032004-09-07 20:24:22 +00005031 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005032 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005033
Guido van Rossumd57fd912000-03-10 22:53:23 +00005034 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02005035 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005036 goto onError;
5037
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005038 Py_XDECREF(errorHandler);
5039 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005040 if (PyUnicode_READY(unicode) == -1) {
5041 Py_DECREF(unicode);
5042 return NULL;
5043 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005044 return (PyObject *)unicode;
5045
Benjamin Peterson29060642009-01-31 22:14:21 +00005046 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005047 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005048 Py_XDECREF(errorHandler);
5049 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005050 return NULL;
5051}
5052
Antoine Pitrouab868312009-01-10 15:40:25 +00005053#undef FAST_CHAR_MASK
5054#undef SWAPPED_FAST_CHAR_MASK
5055
Tim Peters772747b2001-08-09 22:21:55 +00005056PyObject *
5057PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005058 Py_ssize_t size,
5059 const char *errors,
5060 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005061{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005062 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005063 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005064 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005065#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005066 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005067#else
5068 const int pairs = 0;
5069#endif
Tim Peters772747b2001-08-09 22:21:55 +00005070 /* Offsets from p for storing byte pairs in the right order. */
5071#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5072 int ihi = 1, ilo = 0;
5073#else
5074 int ihi = 0, ilo = 1;
5075#endif
5076
Benjamin Peterson29060642009-01-31 22:14:21 +00005077#define STORECHAR(CH) \
5078 do { \
5079 p[ihi] = ((CH) >> 8) & 0xff; \
5080 p[ilo] = (CH) & 0xff; \
5081 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005082 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005083
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005084#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005085 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00005086 if (s[i] >= 0x10000)
5087 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005088#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005089 /* 2 * (size + pairs + (byteorder == 0)) */
5090 if (size > PY_SSIZE_T_MAX ||
5091 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005092 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005093 nsize = size + pairs + (byteorder == 0);
5094 bytesize = nsize * 2;
5095 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005096 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005097 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005098 if (v == NULL)
5099 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005100
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005101 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005102 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005103 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005104 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005105 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005106
5107 if (byteorder == -1) {
5108 /* force LE */
5109 ihi = 1;
5110 ilo = 0;
5111 }
5112 else if (byteorder == 1) {
5113 /* force BE */
5114 ihi = 0;
5115 ilo = 1;
5116 }
5117
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005118 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005119 Py_UNICODE ch = *s++;
5120 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005121#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005122 if (ch >= 0x10000) {
5123 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5124 ch = 0xD800 | ((ch-0x10000) >> 10);
5125 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005126#endif
Tim Peters772747b2001-08-09 22:21:55 +00005127 STORECHAR(ch);
5128 if (ch2)
5129 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005130 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005131
5132 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005133 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005134#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005135}
5136
Alexander Belopolsky40018472011-02-26 01:02:56 +00005137PyObject *
5138PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005139{
5140 if (!PyUnicode_Check(unicode)) {
5141 PyErr_BadArgument();
5142 return NULL;
5143 }
5144 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005145 PyUnicode_GET_SIZE(unicode),
5146 NULL,
5147 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005148}
5149
5150/* --- Unicode Escape Codec ----------------------------------------------- */
5151
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005152/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5153 if all the escapes in the string make it still a valid ASCII string.
5154 Returns -1 if any escapes were found which cause the string to
5155 pop out of ASCII range. Otherwise returns the length of the
5156 required buffer to hold the string.
5157 */
5158Py_ssize_t
5159length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5160{
5161 const unsigned char *p = (const unsigned char *)s;
5162 const unsigned char *end = p + size;
5163 Py_ssize_t length = 0;
5164
5165 if (size < 0)
5166 return -1;
5167
5168 for (; p < end; ++p) {
5169 if (*p > 127) {
5170 /* Non-ASCII */
5171 return -1;
5172 }
5173 else if (*p != '\\') {
5174 /* Normal character */
5175 ++length;
5176 }
5177 else {
5178 /* Backslash-escape, check next char */
5179 ++p;
5180 /* Escape sequence reaches till end of string or
5181 non-ASCII follow-up. */
5182 if (p >= end || *p > 127)
5183 return -1;
5184 switch (*p) {
5185 case '\n':
5186 /* backslash + \n result in zero characters */
5187 break;
5188 case '\\': case '\'': case '\"':
5189 case 'b': case 'f': case 't':
5190 case 'n': case 'r': case 'v': case 'a':
5191 ++length;
5192 break;
5193 case '0': case '1': case '2': case '3':
5194 case '4': case '5': case '6': case '7':
5195 case 'x': case 'u': case 'U': case 'N':
5196 /* these do not guarantee ASCII characters */
5197 return -1;
5198 default:
5199 /* count the backslash + the other character */
5200 length += 2;
5201 }
5202 }
5203 }
5204 return length;
5205}
5206
/* Similar to PyUnicode_WRITE but either write into wstr field
   or treat string as ASCII.

   Stores `value` at position `index` of `buf`: as a Py_UNICODE when `kind`
   is PyUnicode_WCHAR_KIND, otherwise as an unsigned char.  The `kind`
   argument is evaluated once; `buf`, `index` and `value` are each
   evaluated once, in whichever branch is taken. */
#define WRITE_ASCII_OR_WSTR(kind, buf, index, value) \
    do { \
        if ((kind) != PyUnicode_WCHAR_KIND) \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
        else \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
    } while (0)

/* Store `value` as a Py_UNICODE at position `index` of `buf`.

   The macro takes no kind argument: it asserts on a variable named `kind`
   that must be visible at the point of use and equal to
   PyUnicode_WCHAR_KIND.  The expansion is a single parenthesized comma
   expression, so it stays one operand wherever it is placed. */
#define WRITE_WSTR(buf, index, value) \
    (assert(kind == PyUnicode_WCHAR_KIND), \
     ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value))
5220
5221
Fredrik Lundh06d12682001-01-24 07:59:11 +00005222static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005223
Alexander Belopolsky40018472011-02-26 01:02:56 +00005224PyObject *
5225PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005226 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005227 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005228{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005229 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005230 Py_ssize_t startinpos;
5231 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005232 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005233 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005234 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005235 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005236 char* message;
5237 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005238 PyObject *errorHandler = NULL;
5239 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005240 Py_ssize_t ascii_length;
5241 Py_ssize_t i;
5242 int kind;
5243 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005244
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005245 ascii_length = length_of_escaped_ascii_string(s, size);
5246
5247 /* After length_of_escaped_ascii_string() there are two alternatives,
5248 either the string is pure ASCII with named escapes like \n, etc.
5249 and we determined it's exact size (common case)
5250 or it contains \x, \u, ... escape sequences. then we create a
5251 legacy wchar string and resize it at the end of this function. */
5252 if (ascii_length >= 0) {
5253 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
5254 if (!v)
5255 goto onError;
5256 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5257 kind = PyUnicode_1BYTE_KIND;
5258 data = PyUnicode_DATA(v);
5259 }
5260 else {
5261 /* Escaped strings will always be longer than the resulting
5262 Unicode string, so we start with size here and then reduce the
5263 length after conversion to the true value.
5264 (but if the error callback returns a long replacement string
5265 we'll have to allocate more space) */
5266 v = _PyUnicode_New(size);
5267 if (!v)
5268 goto onError;
5269 kind = PyUnicode_WCHAR_KIND;
5270 data = PyUnicode_AS_UNICODE(v);
5271 }
5272
Guido van Rossumd57fd912000-03-10 22:53:23 +00005273 if (size == 0)
5274 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005275 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005276 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005277
Guido van Rossumd57fd912000-03-10 22:53:23 +00005278 while (s < end) {
5279 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005280 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005281 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005282
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005283 if (kind == PyUnicode_WCHAR_KIND) {
5284 assert(i < _PyUnicode_WSTR_LENGTH(v));
5285 }
5286 else {
5287 /* The only case in which i == ascii_length is a backslash
5288 followed by a newline. */
5289 assert(i <= ascii_length);
5290 }
5291
Guido van Rossumd57fd912000-03-10 22:53:23 +00005292 /* Non-escape characters are interpreted as Unicode ordinals */
5293 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005294 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005295 continue;
5296 }
5297
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005298 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005299 /* \ - Escapes */
5300 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005301 c = *s++;
5302 if (s > end)
5303 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005304
5305 if (kind == PyUnicode_WCHAR_KIND) {
5306 assert(i < _PyUnicode_WSTR_LENGTH(v));
5307 }
5308 else {
5309 /* The only case in which i == ascii_length is a backslash
5310 followed by a newline. */
5311 assert(i < ascii_length || (i == ascii_length && c == '\n'));
5312 }
5313
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005314 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005315
Benjamin Peterson29060642009-01-31 22:14:21 +00005316 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005317 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005318 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5319 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5320 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5321 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5322 /* FF */
5323 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5324 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5325 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5326 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5327 /* VT */
5328 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5329 /* BEL, not classic C */
5330 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005331
Benjamin Peterson29060642009-01-31 22:14:21 +00005332 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005333 case '0': case '1': case '2': case '3':
5334 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005335 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005336 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005337 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005338 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005339 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005340 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005341 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005342 break;
5343
Benjamin Peterson29060642009-01-31 22:14:21 +00005344 /* hex escapes */
5345 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005346 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005347 digits = 2;
5348 message = "truncated \\xXX escape";
5349 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005350
Benjamin Peterson29060642009-01-31 22:14:21 +00005351 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005352 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005353 digits = 4;
5354 message = "truncated \\uXXXX escape";
5355 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005356
Benjamin Peterson29060642009-01-31 22:14:21 +00005357 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005358 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005359 digits = 8;
5360 message = "truncated \\UXXXXXXXX escape";
5361 hexescape:
5362 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005363 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005364 if (s+digits>end) {
5365 endinpos = size;
5366 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005367 errors, &errorHandler,
5368 "unicodeescape", "end of string in escape sequence",
5369 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005370 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005371 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005372 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005373 goto nextByte;
5374 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005375 for (j = 0; j < digits; ++j) {
5376 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005377 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005378 endinpos = (s+j+1)-starts;
5379 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005380 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005381 errors, &errorHandler,
5382 "unicodeescape", message,
5383 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005384 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005385 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005386 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005387 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005388 }
5389 chr = (chr<<4) & ~0xF;
5390 if (c >= '0' && c <= '9')
5391 chr += c - '0';
5392 else if (c >= 'a' && c <= 'f')
5393 chr += 10 + c - 'a';
5394 else
5395 chr += 10 + c - 'A';
5396 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005397 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005398 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005399 /* _decoding_error will have already written into the
5400 target buffer. */
5401 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005402 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005403 /* when we get here, chr is a 32-bit unicode character */
5404 if (chr <= 0xffff)
5405 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005406 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005407 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005408 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005409 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005410#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005411 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005412#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005413 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005414 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5415 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005416#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005417 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005418 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005419 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005420 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005421 errors, &errorHandler,
5422 "unicodeescape", "illegal Unicode character",
5423 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005424 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005425 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005426 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005427 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005428 break;
5429
Benjamin Peterson29060642009-01-31 22:14:21 +00005430 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005431 case 'N':
5432 message = "malformed \\N character escape";
5433 if (ucnhash_CAPI == NULL) {
5434 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005435 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5436 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005437 if (ucnhash_CAPI == NULL)
5438 goto ucnhashError;
5439 }
5440 if (*s == '{') {
5441 const char *start = s+1;
5442 /* look for the closing brace */
5443 while (*s != '}' && s < end)
5444 s++;
5445 if (s > start && s < end && *s == '}') {
5446 /* found a name. look it up in the unicode database */
5447 message = "unknown Unicode character name";
5448 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005449 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5450 &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005451 goto store;
5452 }
5453 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005454 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005455 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005456 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005457 errors, &errorHandler,
5458 "unicodeescape", message,
5459 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005460 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005461 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005462 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005463 break;
5464
5465 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005466 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005467 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005468 message = "\\ at end of string";
5469 s--;
5470 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005471 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005472 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005473 errors, &errorHandler,
5474 "unicodeescape", message,
5475 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005476 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005477 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005478 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005479 }
5480 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005481 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5482 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005483 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005484 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005485 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005486 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005487 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005488 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005489 /* Ensure the length prediction worked in case of ASCII strings */
5490 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5491
Victor Stinnerfe226c02011-10-03 03:52:20 +02005492 if (kind == PyUnicode_WCHAR_KIND)
5493 {
5494 if (PyUnicode_Resize((PyObject**)&v, i) < 0)
5495 goto onError;
5496 if (PyUnicode_READY(v) == -1)
5497 goto onError;
5498 }
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005499 Py_XDECREF(errorHandler);
5500 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005501 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005502
Benjamin Peterson29060642009-01-31 22:14:21 +00005503 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005504 PyErr_SetString(
5505 PyExc_UnicodeError,
5506 "\\N escapes not supported (can't load unicodedata module)"
5507 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005508 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005509 Py_XDECREF(errorHandler);
5510 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005511 return NULL;
5512
Benjamin Peterson29060642009-01-31 22:14:21 +00005513 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005514 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005515 Py_XDECREF(errorHandler);
5516 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005517 return NULL;
5518}
5519
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005520#undef WRITE_ASCII_OR_WSTR
5521#undef WRITE_WSTR
5522
Guido van Rossumd57fd912000-03-10 22:53:23 +00005523/* Return a Unicode-Escape string version of the Unicode object.
5524
5525 If quotes is true, the string is enclosed in u"" or u'' quotes as
5526 appropriate.
5527
5528*/
5529
Walter Dörwald79e913e2007-05-12 11:08:06 +00005530static const char *hexdigits = "0123456789abcdef";
5531
Alexander Belopolsky40018472011-02-26 01:02:56 +00005532PyObject *
5533PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005534 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005535{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005536 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005537 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005538
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005539#ifdef Py_UNICODE_WIDE
5540 const Py_ssize_t expandsize = 10;
5541#else
5542 const Py_ssize_t expandsize = 6;
5543#endif
5544
Thomas Wouters89f507f2006-12-13 04:49:30 +00005545 /* XXX(nnorwitz): rather than over-allocating, it would be
5546 better to choose a different scheme. Perhaps scan the
5547 first N-chars of the string and allocate based on that size.
5548 */
5549 /* Initial allocation is based on the longest-possible unichr
5550 escape.
5551
5552 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5553 unichr, so in this case it's the longest unichr escape. In
5554 narrow (UTF-16) builds this is five chars per source unichr
5555 since there are two unichrs in the surrogate pair, so in narrow
5556 (UTF-16) builds it's not the longest unichr escape.
5557
5558 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5559 so in the narrow (UTF-16) build case it's the longest unichr
5560 escape.
5561 */
5562
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005563 if (size == 0)
5564 return PyBytes_FromStringAndSize(NULL, 0);
5565
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005566 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005567 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005568
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005569 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005570 2
5571 + expandsize*size
5572 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005573 if (repr == NULL)
5574 return NULL;
5575
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005576 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005577
Guido van Rossumd57fd912000-03-10 22:53:23 +00005578 while (size-- > 0) {
5579 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005580
Walter Dörwald79e913e2007-05-12 11:08:06 +00005581 /* Escape backslashes */
5582 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005583 *p++ = '\\';
5584 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005585 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005586 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005587
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00005588#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005589 /* Map 21-bit characters to '\U00xxxxxx' */
5590 else if (ch >= 0x10000) {
5591 *p++ = '\\';
5592 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005593 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5594 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5595 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5596 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5597 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5598 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5599 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5600 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005601 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005602 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005603#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005604 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5605 else if (ch >= 0xD800 && ch < 0xDC00) {
5606 Py_UNICODE ch2;
5607 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00005608
Benjamin Peterson29060642009-01-31 22:14:21 +00005609 ch2 = *s++;
5610 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005611 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005612 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5613 *p++ = '\\';
5614 *p++ = 'U';
5615 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
5616 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
5617 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
5618 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
5619 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
5620 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
5621 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
5622 *p++ = hexdigits[ucs & 0x0000000F];
5623 continue;
5624 }
5625 /* Fall through: isolated surrogates are copied as-is */
5626 s--;
5627 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005628 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005629#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005630
Guido van Rossumd57fd912000-03-10 22:53:23 +00005631 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005632 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005633 *p++ = '\\';
5634 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005635 *p++ = hexdigits[(ch >> 12) & 0x000F];
5636 *p++ = hexdigits[(ch >> 8) & 0x000F];
5637 *p++ = hexdigits[(ch >> 4) & 0x000F];
5638 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005639 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005640
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005641 /* Map special whitespace to '\t', \n', '\r' */
5642 else if (ch == '\t') {
5643 *p++ = '\\';
5644 *p++ = 't';
5645 }
5646 else if (ch == '\n') {
5647 *p++ = '\\';
5648 *p++ = 'n';
5649 }
5650 else if (ch == '\r') {
5651 *p++ = '\\';
5652 *p++ = 'r';
5653 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005654
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005655 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005656 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005657 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005658 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005659 *p++ = hexdigits[(ch >> 4) & 0x000F];
5660 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005661 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005662
Guido van Rossumd57fd912000-03-10 22:53:23 +00005663 /* Copy everything else as-is */
5664 else
5665 *p++ = (char) ch;
5666 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005667
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005668 assert(p - PyBytes_AS_STRING(repr) > 0);
5669 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5670 return NULL;
5671 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005672}
5673
Alexander Belopolsky40018472011-02-26 01:02:56 +00005674PyObject *
5675PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005676{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005677 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005678 if (!PyUnicode_Check(unicode)) {
5679 PyErr_BadArgument();
5680 return NULL;
5681 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00005682 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5683 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005684 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005685}
5686
5687/* --- Raw Unicode Escape Codec ------------------------------------------- */
5688
Alexander Belopolsky40018472011-02-26 01:02:56 +00005689PyObject *
5690PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005691 Py_ssize_t size,
5692 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005693{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005694 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005695 Py_ssize_t startinpos;
5696 Py_ssize_t endinpos;
5697 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005698 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005699 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005700 const char *end;
5701 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005702 PyObject *errorHandler = NULL;
5703 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005704
Guido van Rossumd57fd912000-03-10 22:53:23 +00005705 /* Escaped strings will always be longer than the resulting
5706 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005707 length after conversion to the true value. (But decoding error
5708 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005709 v = _PyUnicode_New(size);
5710 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005711 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005712 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005713 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005714 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005715 end = s + size;
5716 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005717 unsigned char c;
5718 Py_UCS4 x;
5719 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005720 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005721
Benjamin Peterson29060642009-01-31 22:14:21 +00005722 /* Non-escape characters are interpreted as Unicode ordinals */
5723 if (*s != '\\') {
5724 *p++ = (unsigned char)*s++;
5725 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005726 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005727 startinpos = s-starts;
5728
5729 /* \u-escapes are only interpreted iff the number of leading
5730 backslashes if odd */
5731 bs = s;
5732 for (;s < end;) {
5733 if (*s != '\\')
5734 break;
5735 *p++ = (unsigned char)*s++;
5736 }
5737 if (((s - bs) & 1) == 0 ||
5738 s >= end ||
5739 (*s != 'u' && *s != 'U')) {
5740 continue;
5741 }
5742 p--;
5743 count = *s=='u' ? 4 : 8;
5744 s++;
5745
5746 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
5747 outpos = p-PyUnicode_AS_UNICODE(v);
5748 for (x = 0, i = 0; i < count; ++i, ++s) {
5749 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005750 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005751 endinpos = s-starts;
5752 if (unicode_decode_call_errorhandler(
5753 errors, &errorHandler,
5754 "rawunicodeescape", "truncated \\uXXXX",
5755 &starts, &end, &startinpos, &endinpos, &exc, &s,
5756 &v, &outpos, &p))
5757 goto onError;
5758 goto nextByte;
5759 }
5760 x = (x<<4) & ~0xF;
5761 if (c >= '0' && c <= '9')
5762 x += c - '0';
5763 else if (c >= 'a' && c <= 'f')
5764 x += 10 + c - 'a';
5765 else
5766 x += 10 + c - 'A';
5767 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00005768 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00005769 /* UCS-2 character */
5770 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005771 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005772 /* UCS-4 character. Either store directly, or as
5773 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00005774#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005775 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005776#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005777 x -= 0x10000L;
5778 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
5779 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00005780#endif
5781 } else {
5782 endinpos = s-starts;
5783 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005784 if (unicode_decode_call_errorhandler(
5785 errors, &errorHandler,
5786 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005787 &starts, &end, &startinpos, &endinpos, &exc, &s,
5788 &v, &outpos, &p))
5789 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005790 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005791 nextByte:
5792 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005793 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02005794 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005795 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005796 Py_XDECREF(errorHandler);
5797 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005798 if (PyUnicode_READY(v) == -1) {
5799 Py_DECREF(v);
5800 return NULL;
5801 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005802 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005803
Benjamin Peterson29060642009-01-31 22:14:21 +00005804 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005805 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005806 Py_XDECREF(errorHandler);
5807 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005808 return NULL;
5809}
5810
Alexander Belopolsky40018472011-02-26 01:02:56 +00005811PyObject *
5812PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005813 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005814{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005815 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005816 char *p;
5817 char *q;
5818
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005819#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005820 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005821#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005822 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005823#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00005824
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005825 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005826 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005827
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005828 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005829 if (repr == NULL)
5830 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005831 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005832 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005833
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005834 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005835 while (size-- > 0) {
5836 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005837#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005838 /* Map 32-bit characters to '\Uxxxxxxxx' */
5839 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005840 *p++ = '\\';
5841 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005842 *p++ = hexdigits[(ch >> 28) & 0xf];
5843 *p++ = hexdigits[(ch >> 24) & 0xf];
5844 *p++ = hexdigits[(ch >> 20) & 0xf];
5845 *p++ = hexdigits[(ch >> 16) & 0xf];
5846 *p++ = hexdigits[(ch >> 12) & 0xf];
5847 *p++ = hexdigits[(ch >> 8) & 0xf];
5848 *p++ = hexdigits[(ch >> 4) & 0xf];
5849 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00005850 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005851 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00005852#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005853 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5854 if (ch >= 0xD800 && ch < 0xDC00) {
5855 Py_UNICODE ch2;
5856 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005857
Benjamin Peterson29060642009-01-31 22:14:21 +00005858 ch2 = *s++;
5859 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005860 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005861 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5862 *p++ = '\\';
5863 *p++ = 'U';
5864 *p++ = hexdigits[(ucs >> 28) & 0xf];
5865 *p++ = hexdigits[(ucs >> 24) & 0xf];
5866 *p++ = hexdigits[(ucs >> 20) & 0xf];
5867 *p++ = hexdigits[(ucs >> 16) & 0xf];
5868 *p++ = hexdigits[(ucs >> 12) & 0xf];
5869 *p++ = hexdigits[(ucs >> 8) & 0xf];
5870 *p++ = hexdigits[(ucs >> 4) & 0xf];
5871 *p++ = hexdigits[ucs & 0xf];
5872 continue;
5873 }
5874 /* Fall through: isolated surrogates are copied as-is */
5875 s--;
5876 size++;
5877 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005878#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005879 /* Map 16-bit characters to '\uxxxx' */
5880 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005881 *p++ = '\\';
5882 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005883 *p++ = hexdigits[(ch >> 12) & 0xf];
5884 *p++ = hexdigits[(ch >> 8) & 0xf];
5885 *p++ = hexdigits[(ch >> 4) & 0xf];
5886 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005887 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005888 /* Copy everything else as-is */
5889 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00005890 *p++ = (char) ch;
5891 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005892 size = p - q;
5893
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005894 assert(size > 0);
5895 if (_PyBytes_Resize(&repr, size) < 0)
5896 return NULL;
5897 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005898}
5899
Alexander Belopolsky40018472011-02-26 01:02:56 +00005900PyObject *
5901PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005902{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005903 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005904 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00005905 PyErr_BadArgument();
5906 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005907 }
Walter Dörwald711005d2007-05-12 12:03:26 +00005908 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5909 PyUnicode_GET_SIZE(unicode));
5910
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005911 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005912}
5913
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005914/* --- Unicode Internal Codec ------------------------------------------- */
5915
Alexander Belopolsky40018472011-02-26 01:02:56 +00005916PyObject *
5917_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005918 Py_ssize_t size,
5919 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005920{
5921 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005922 Py_ssize_t startinpos;
5923 Py_ssize_t endinpos;
5924 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005925 PyUnicodeObject *v;
5926 Py_UNICODE *p;
5927 const char *end;
5928 const char *reason;
5929 PyObject *errorHandler = NULL;
5930 PyObject *exc = NULL;
5931
Neal Norwitzd43069c2006-01-08 01:12:10 +00005932#ifdef Py_UNICODE_WIDE
5933 Py_UNICODE unimax = PyUnicode_GetMax();
5934#endif
5935
Thomas Wouters89f507f2006-12-13 04:49:30 +00005936 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005937 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
5938 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005939 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005940 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
5941 as string was created with the old API. */
5942 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005943 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005944 p = PyUnicode_AS_UNICODE(v);
5945 end = s + size;
5946
5947 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005948 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005949 /* We have to sanity check the raw data, otherwise doom looms for
5950 some malformed UCS-4 data. */
5951 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00005952#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005953 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00005954#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005955 end-s < Py_UNICODE_SIZE
5956 )
Benjamin Peterson29060642009-01-31 22:14:21 +00005957 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005958 startinpos = s - starts;
5959 if (end-s < Py_UNICODE_SIZE) {
5960 endinpos = end-starts;
5961 reason = "truncated input";
5962 }
5963 else {
5964 endinpos = s - starts + Py_UNICODE_SIZE;
5965 reason = "illegal code point (> 0x10FFFF)";
5966 }
5967 outpos = p - PyUnicode_AS_UNICODE(v);
5968 if (unicode_decode_call_errorhandler(
5969 errors, &errorHandler,
5970 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00005971 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00005972 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005973 goto onError;
5974 }
5975 }
5976 else {
5977 p++;
5978 s += Py_UNICODE_SIZE;
5979 }
5980 }
5981
Victor Stinnerfe226c02011-10-03 03:52:20 +02005982 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005983 goto onError;
5984 Py_XDECREF(errorHandler);
5985 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005986 if (PyUnicode_READY(v) == -1) {
5987 Py_DECREF(v);
5988 return NULL;
5989 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005990 return (PyObject *)v;
5991
Benjamin Peterson29060642009-01-31 22:14:21 +00005992 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005993 Py_XDECREF(v);
5994 Py_XDECREF(errorHandler);
5995 Py_XDECREF(exc);
5996 return NULL;
5997}
5998
Guido van Rossumd57fd912000-03-10 22:53:23 +00005999/* --- Latin-1 Codec ------------------------------------------------------ */
6000
Alexander Belopolsky40018472011-02-26 01:02:56 +00006001PyObject *
6002PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006003 Py_ssize_t size,
6004 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006005{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006006 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006007 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006008}
6009
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006010/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006011static void
6012make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006013 const char *encoding,
6014 const Py_UNICODE *unicode, Py_ssize_t size,
6015 Py_ssize_t startpos, Py_ssize_t endpos,
6016 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006017{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006018 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006019 *exceptionObject = PyUnicodeEncodeError_Create(
6020 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006021 }
6022 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006023 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6024 goto onError;
6025 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6026 goto onError;
6027 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6028 goto onError;
6029 return;
6030 onError:
6031 Py_DECREF(*exceptionObject);
6032 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006033 }
6034}
6035
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006036/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006037static void
6038raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006039 const char *encoding,
6040 const Py_UNICODE *unicode, Py_ssize_t size,
6041 Py_ssize_t startpos, Py_ssize_t endpos,
6042 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006043{
6044 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006045 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006046 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006047 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006048}
6049
6050/* error handling callback helper:
6051 build arguments, call the callback and check the arguments,
6052 put the result into newpos and return the replacement string, which
6053 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006054static PyObject *
6055unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006056 PyObject **errorHandler,
6057 const char *encoding, const char *reason,
6058 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
6059 Py_ssize_t startpos, Py_ssize_t endpos,
6060 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006061{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006062 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006063
6064 PyObject *restuple;
6065 PyObject *resunicode;
6066
6067 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006068 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006069 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006070 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006071 }
6072
6073 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006074 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006075 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006076 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006077
6078 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006079 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006080 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006081 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006082 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006083 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006084 Py_DECREF(restuple);
6085 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006086 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006087 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006088 &resunicode, newpos)) {
6089 Py_DECREF(restuple);
6090 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006091 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006092 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6093 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6094 Py_DECREF(restuple);
6095 return NULL;
6096 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006097 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006098 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006099 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006100 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6101 Py_DECREF(restuple);
6102 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006103 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006104 Py_INCREF(resunicode);
6105 Py_DECREF(restuple);
6106 return resunicode;
6107}
6108
Alexander Belopolsky40018472011-02-26 01:02:56 +00006109static PyObject *
6110unicode_encode_ucs1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006111 Py_ssize_t size,
6112 const char *errors,
6113 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006114{
6115 /* output object */
6116 PyObject *res;
6117 /* pointers to the beginning and end+1 of input */
6118 const Py_UNICODE *startp = p;
6119 const Py_UNICODE *endp = p + size;
6120 /* pointer to the beginning of the unencodable characters */
6121 /* const Py_UNICODE *badp = NULL; */
6122 /* pointer into the output */
6123 char *str;
6124 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006125 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006126 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6127 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006128 PyObject *errorHandler = NULL;
6129 PyObject *exc = NULL;
6130 /* the following variable is used for caching string comparisons
6131 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6132 int known_errorHandler = -1;
6133
6134 /* allocate enough for a simple encoding without
6135 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006136 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006137 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006138 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006139 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006140 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006141 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006142 ressize = size;
6143
6144 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006145 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006146
Benjamin Peterson29060642009-01-31 22:14:21 +00006147 /* can we encode this? */
6148 if (c<limit) {
6149 /* no overflow check, because we know that the space is enough */
6150 *str++ = (char)c;
6151 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006152 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006153 else {
6154 Py_ssize_t unicodepos = p-startp;
6155 Py_ssize_t requiredsize;
6156 PyObject *repunicode;
6157 Py_ssize_t repsize;
6158 Py_ssize_t newpos;
6159 Py_ssize_t respos;
6160 Py_UNICODE *uni2;
6161 /* startpos for collecting unencodable chars */
6162 const Py_UNICODE *collstart = p;
6163 const Py_UNICODE *collend = p;
6164 /* find all unecodable characters */
6165 while ((collend < endp) && ((*collend)>=limit))
6166 ++collend;
6167 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6168 if (known_errorHandler==-1) {
6169 if ((errors==NULL) || (!strcmp(errors, "strict")))
6170 known_errorHandler = 1;
6171 else if (!strcmp(errors, "replace"))
6172 known_errorHandler = 2;
6173 else if (!strcmp(errors, "ignore"))
6174 known_errorHandler = 3;
6175 else if (!strcmp(errors, "xmlcharrefreplace"))
6176 known_errorHandler = 4;
6177 else
6178 known_errorHandler = 0;
6179 }
6180 switch (known_errorHandler) {
6181 case 1: /* strict */
6182 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
6183 goto onError;
6184 case 2: /* replace */
6185 while (collstart++<collend)
6186 *str++ = '?'; /* fall through */
6187 case 3: /* ignore */
6188 p = collend;
6189 break;
6190 case 4: /* xmlcharrefreplace */
6191 respos = str - PyBytes_AS_STRING(res);
6192 /* determine replacement size (temporarily (mis)uses p) */
6193 for (p = collstart, repsize = 0; p < collend; ++p) {
6194 if (*p<10)
6195 repsize += 2+1+1;
6196 else if (*p<100)
6197 repsize += 2+2+1;
6198 else if (*p<1000)
6199 repsize += 2+3+1;
6200 else if (*p<10000)
6201 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006202#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006203 else
6204 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006205#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006206 else if (*p<100000)
6207 repsize += 2+5+1;
6208 else if (*p<1000000)
6209 repsize += 2+6+1;
6210 else
6211 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00006212#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006213 }
6214 requiredsize = respos+repsize+(endp-collend);
6215 if (requiredsize > ressize) {
6216 if (requiredsize<2*ressize)
6217 requiredsize = 2*ressize;
6218 if (_PyBytes_Resize(&res, requiredsize))
6219 goto onError;
6220 str = PyBytes_AS_STRING(res) + respos;
6221 ressize = requiredsize;
6222 }
6223 /* generate replacement (temporarily (mis)uses p) */
6224 for (p = collstart; p < collend; ++p) {
6225 str += sprintf(str, "&#%d;", (int)*p);
6226 }
6227 p = collend;
6228 break;
6229 default:
6230 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6231 encoding, reason, startp, size, &exc,
6232 collstart-startp, collend-startp, &newpos);
6233 if (repunicode == NULL)
6234 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006235 if (PyBytes_Check(repunicode)) {
6236 /* Directly copy bytes result to output. */
6237 repsize = PyBytes_Size(repunicode);
6238 if (repsize > 1) {
6239 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006240 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006241 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6242 Py_DECREF(repunicode);
6243 goto onError;
6244 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006245 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006246 ressize += repsize-1;
6247 }
6248 memcpy(str, PyBytes_AsString(repunicode), repsize);
6249 str += repsize;
6250 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006251 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006252 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006253 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006254 /* need more space? (at least enough for what we
6255 have+the replacement+the rest of the string, so
6256 we won't have to check space for encodable characters) */
6257 respos = str - PyBytes_AS_STRING(res);
6258 repsize = PyUnicode_GET_SIZE(repunicode);
6259 requiredsize = respos+repsize+(endp-collend);
6260 if (requiredsize > ressize) {
6261 if (requiredsize<2*ressize)
6262 requiredsize = 2*ressize;
6263 if (_PyBytes_Resize(&res, requiredsize)) {
6264 Py_DECREF(repunicode);
6265 goto onError;
6266 }
6267 str = PyBytes_AS_STRING(res) + respos;
6268 ressize = requiredsize;
6269 }
6270 /* check if there is anything unencodable in the replacement
6271 and copy it to the output */
6272 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
6273 c = *uni2;
6274 if (c >= limit) {
6275 raise_encode_exception(&exc, encoding, startp, size,
6276 unicodepos, unicodepos+1, reason);
6277 Py_DECREF(repunicode);
6278 goto onError;
6279 }
6280 *str = (char)c;
6281 }
6282 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006283 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006284 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006285 }
6286 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006287 /* Resize if we allocated to much */
6288 size = str - PyBytes_AS_STRING(res);
6289 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006290 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006291 if (_PyBytes_Resize(&res, size) < 0)
6292 goto onError;
6293 }
6294
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006295 Py_XDECREF(errorHandler);
6296 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006297 return res;
6298
6299 onError:
6300 Py_XDECREF(res);
6301 Py_XDECREF(errorHandler);
6302 Py_XDECREF(exc);
6303 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006304}
6305
Alexander Belopolsky40018472011-02-26 01:02:56 +00006306PyObject *
6307PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006308 Py_ssize_t size,
6309 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006310{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006311 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006312}
6313
Alexander Belopolsky40018472011-02-26 01:02:56 +00006314PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006315_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006316{
6317 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006318 PyErr_BadArgument();
6319 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006320 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006321 if (PyUnicode_READY(unicode) == -1)
6322 return NULL;
6323 /* Fast path: if it is a one-byte string, construct
6324 bytes object directly. */
6325 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6326 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6327 PyUnicode_GET_LENGTH(unicode));
6328 /* Non-Latin-1 characters present. Defer to above function to
6329 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006330 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006331 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006332 errors);
6333}
6334
6335PyObject*
6336PyUnicode_AsLatin1String(PyObject *unicode)
6337{
6338 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006339}
6340
6341/* --- 7-bit ASCII Codec -------------------------------------------------- */
6342
Alexander Belopolsky40018472011-02-26 01:02:56 +00006343PyObject *
6344PyUnicode_DecodeASCII(const char *s,
6345 Py_ssize_t size,
6346 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006347{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006348 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006349 PyUnicodeObject *v;
6350 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006351 Py_ssize_t startinpos;
6352 Py_ssize_t endinpos;
6353 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006354 const char *e;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006355 unsigned char* d;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006356 PyObject *errorHandler = NULL;
6357 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006358 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00006359
Guido van Rossumd57fd912000-03-10 22:53:23 +00006360 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006361 if (size == 1 && *(unsigned char*)s < 128)
6362 return PyUnicode_FromOrdinal(*(unsigned char*)s);
6363
6364 /* Fast path. Assume the input actually *is* ASCII, and allocate
6365 a single-block Unicode object with that assumption. If there is
6366 an error, drop the object and start over. */
6367 v = (PyUnicodeObject*)PyUnicode_New(size, 127);
6368 if (v == NULL)
6369 goto onError;
6370 d = PyUnicode_1BYTE_DATA(v);
6371 for (i = 0; i < size; i++) {
6372 unsigned char ch = ((unsigned char*)s)[i];
6373 if (ch < 128)
6374 d[i] = ch;
6375 else
6376 break;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006377 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006378 if (i == size)
6379 return (PyObject*)v;
6380 Py_DECREF(v); /* start over */
Tim Petersced69f82003-09-16 20:30:58 +00006381
Guido van Rossumd57fd912000-03-10 22:53:23 +00006382 v = _PyUnicode_New(size);
6383 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006384 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006385 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006386 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006387 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006388 e = s + size;
6389 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006390 register unsigned char c = (unsigned char)*s;
6391 if (c < 128) {
6392 *p++ = c;
6393 ++s;
6394 }
6395 else {
6396 startinpos = s-starts;
6397 endinpos = startinpos + 1;
6398 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
6399 if (unicode_decode_call_errorhandler(
6400 errors, &errorHandler,
6401 "ascii", "ordinal not in range(128)",
6402 &starts, &e, &startinpos, &endinpos, &exc, &s,
6403 &v, &outpos, &p))
6404 goto onError;
6405 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006406 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00006407 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinnerfe226c02011-10-03 03:52:20 +02006408 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006409 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006410 Py_XDECREF(errorHandler);
6411 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006412 if (PyUnicode_READY(v) == -1) {
6413 Py_DECREF(v);
6414 return NULL;
6415 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006416 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006417
Benjamin Peterson29060642009-01-31 22:14:21 +00006418 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006419 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006420 Py_XDECREF(errorHandler);
6421 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006422 return NULL;
6423}
6424
Alexander Belopolsky40018472011-02-26 01:02:56 +00006425PyObject *
6426PyUnicode_EncodeASCII(const Py_UNICODE *p,
6427 Py_ssize_t size,
6428 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006429{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006430 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006431}
6432
Alexander Belopolsky40018472011-02-26 01:02:56 +00006433PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006434_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006435{
6436 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006437 PyErr_BadArgument();
6438 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006439 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006440 if (PyUnicode_READY(unicode) == -1)
6441 return NULL;
6442 /* Fast path: if it is an ASCII-only string, construct bytes object
6443 directly. Else defer to above function to raise the exception. */
6444 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6445 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6446 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006447 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006448 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006449 errors);
6450}
6451
6452PyObject *
6453PyUnicode_AsASCIIString(PyObject *unicode)
6454{
6455 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006456}
6457
Victor Stinner99b95382011-07-04 14:23:54 +02006458#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006459
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006460/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006461
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006462#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006463#define NEED_RETRY
6464#endif
6465
6466/* XXX This code is limited to "true" double-byte encodings, as
6467 a) it assumes an incomplete character consists of a single byte, and
6468 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00006469 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006470
/* Test the byte at s[offset] with IsDBCSLeadByte().

   Returns 0 when that byte is not reported as a lead byte.  Otherwise the
   position is cross-checked by stepping back one character with CharPrev():
   the result is nonzero if there is no previous character, if the previous
   character does not start with a lead byte, or if the previous character
   starts exactly two bytes earlier. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2);
}
6482
6483/*
6484 * Decode MBCS string into unicode object. If 'final' is set, converts
6485 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6486 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006487static int
6488decode_mbcs(PyUnicodeObject **v,
6489 const char *s, /* MBCS string */
6490 int size, /* sizeof MBCS string */
6491 int final,
6492 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006493{
6494 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00006495 Py_ssize_t n;
6496 DWORD usize;
6497 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006498
6499 assert(size >= 0);
6500
Victor Stinner554f3f02010-06-16 23:33:54 +00006501 /* check and handle 'errors' arg */
6502 if (errors==NULL || strcmp(errors, "strict")==0)
6503 flags = MB_ERR_INVALID_CHARS;
6504 else if (strcmp(errors, "ignore")==0)
6505 flags = 0;
6506 else {
6507 PyErr_Format(PyExc_ValueError,
6508 "mbcs encoding does not support errors='%s'",
6509 errors);
6510 return -1;
6511 }
6512
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006513 /* Skip trailing lead-byte unless 'final' is set */
6514 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006515 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006516
6517 /* First get the size of the result */
6518 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006519 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6520 if (usize==0)
6521 goto mbcs_decode_error;
6522 } else
6523 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006524
6525 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006526 /* Create unicode object */
6527 *v = _PyUnicode_New(usize);
6528 if (*v == NULL)
6529 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006530 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006531 }
6532 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006533 /* Extend unicode object */
6534 n = PyUnicode_GET_SIZE(*v);
Victor Stinner2fd82272011-10-03 04:06:05 +02006535 if (PyUnicode_Resize((PyObject**)v, n + usize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006536 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006537 }
6538
6539 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00006540 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006541 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006542 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6543 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00006544 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006545 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006546 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00006547
6548mbcs_decode_error:
6549 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6550 we raise a UnicodeDecodeError - else it is a 'generic'
6551 windows error
6552 */
6553 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6554 /* Ideally, we should get reason from FormatMessage - this
6555 is the Windows 2000 English version of the message
6556 */
6557 PyObject *exc = NULL;
6558 const char *reason = "No mapping for the Unicode character exists "
6559 "in the target multi-byte code page.";
6560 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6561 if (exc != NULL) {
6562 PyCodec_StrictErrors(exc);
6563 Py_DECREF(exc);
6564 }
6565 } else {
6566 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6567 }
6568 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006569}
6570
Alexander Belopolsky40018472011-02-26 01:02:56 +00006571PyObject *
6572PyUnicode_DecodeMBCSStateful(const char *s,
6573 Py_ssize_t size,
6574 const char *errors,
6575 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006576{
6577 PyUnicodeObject *v = NULL;
6578 int done;
6579
6580 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006581 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006582
6583#ifdef NEED_RETRY
6584 retry:
6585 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006586 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006587 else
6588#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006589 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006590
6591 if (done < 0) {
6592 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006593 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006594 }
6595
6596 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006597 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006598
6599#ifdef NEED_RETRY
6600 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006601 s += done;
6602 size -= done;
6603 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006604 }
6605#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006606 if (PyUnicode_READY(v) == -1) {
6607 Py_DECREF(v);
6608 return NULL;
6609 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006610 return (PyObject *)v;
6611}
6612
Alexander Belopolsky40018472011-02-26 01:02:56 +00006613PyObject *
6614PyUnicode_DecodeMBCS(const char *s,
6615 Py_ssize_t size,
6616 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006617{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006618 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6619}
6620
6621/*
6622 * Convert unicode into string object (MBCS).
6623 * Returns 0 if succeed, -1 otherwise.
6624 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006625static int
6626encode_mbcs(PyObject **repr,
6627 const Py_UNICODE *p, /* unicode */
6628 int size, /* size of unicode */
6629 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006630{
Victor Stinner554f3f02010-06-16 23:33:54 +00006631 BOOL usedDefaultChar = FALSE;
6632 BOOL *pusedDefaultChar;
6633 int mbcssize;
6634 Py_ssize_t n;
6635 PyObject *exc = NULL;
6636 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006637
6638 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006639
Victor Stinner554f3f02010-06-16 23:33:54 +00006640 /* check and handle 'errors' arg */
6641 if (errors==NULL || strcmp(errors, "strict")==0) {
6642 flags = WC_NO_BEST_FIT_CHARS;
6643 pusedDefaultChar = &usedDefaultChar;
6644 } else if (strcmp(errors, "replace")==0) {
6645 flags = 0;
6646 pusedDefaultChar = NULL;
6647 } else {
6648 PyErr_Format(PyExc_ValueError,
6649 "mbcs encoding does not support errors='%s'",
6650 errors);
6651 return -1;
6652 }
6653
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006654 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006655 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006656 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
6657 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00006658 if (mbcssize == 0) {
6659 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6660 return -1;
6661 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006662 /* If we used a default char, then we failed! */
6663 if (pusedDefaultChar && *pusedDefaultChar)
6664 goto mbcs_encode_error;
6665 } else {
6666 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006667 }
6668
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006669 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006670 /* Create string object */
6671 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
6672 if (*repr == NULL)
6673 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006674 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006675 }
6676 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006677 /* Extend string object */
6678 n = PyBytes_Size(*repr);
6679 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
6680 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006681 }
6682
6683 /* Do the conversion */
6684 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006685 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006686 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
6687 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006688 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6689 return -1;
6690 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006691 if (pusedDefaultChar && *pusedDefaultChar)
6692 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006693 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006694 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006695
6696mbcs_encode_error:
6697 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
6698 Py_XDECREF(exc);
6699 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006700}
6701
Alexander Belopolsky40018472011-02-26 01:02:56 +00006702PyObject *
6703PyUnicode_EncodeMBCS(const Py_UNICODE *p,
6704 Py_ssize_t size,
6705 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006706{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006707 PyObject *repr = NULL;
6708 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00006709
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006710#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00006711 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006712 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006713 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006714 else
6715#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006716 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006717
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006718 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006719 Py_XDECREF(repr);
6720 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006721 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006722
6723#ifdef NEED_RETRY
6724 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006725 p += INT_MAX;
6726 size -= INT_MAX;
6727 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006728 }
6729#endif
6730
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006731 return repr;
6732}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006733
Alexander Belopolsky40018472011-02-26 01:02:56 +00006734PyObject *
6735PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006736{
6737 if (!PyUnicode_Check(unicode)) {
6738 PyErr_BadArgument();
6739 return NULL;
6740 }
6741 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006742 PyUnicode_GET_SIZE(unicode),
6743 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006744}
6745
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006746#undef NEED_RETRY
6747
Victor Stinner99b95382011-07-04 14:23:54 +02006748#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006749
Guido van Rossumd57fd912000-03-10 22:53:23 +00006750/* --- Character Mapping Codec -------------------------------------------- */
6751
Alexander Belopolsky40018472011-02-26 01:02:56 +00006752PyObject *
6753PyUnicode_DecodeCharmap(const char *s,
6754 Py_ssize_t size,
6755 PyObject *mapping,
6756 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006757{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006758 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006759 Py_ssize_t startinpos;
6760 Py_ssize_t endinpos;
6761 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006762 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006763 PyUnicodeObject *v;
6764 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006765 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006766 PyObject *errorHandler = NULL;
6767 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006768 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006769 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006770
Guido van Rossumd57fd912000-03-10 22:53:23 +00006771 /* Default to Latin-1 */
6772 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006773 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006774
6775 v = _PyUnicode_New(size);
6776 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006777 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006778 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006779 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006780 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006781 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006782 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006783 mapstring = PyUnicode_AS_UNICODE(mapping);
6784 maplen = PyUnicode_GET_SIZE(mapping);
6785 while (s < e) {
6786 unsigned char ch = *s;
6787 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006788
Benjamin Peterson29060642009-01-31 22:14:21 +00006789 if (ch < maplen)
6790 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006791
Benjamin Peterson29060642009-01-31 22:14:21 +00006792 if (x == 0xfffe) {
6793 /* undefined mapping */
6794 outpos = p-PyUnicode_AS_UNICODE(v);
6795 startinpos = s-starts;
6796 endinpos = startinpos+1;
6797 if (unicode_decode_call_errorhandler(
6798 errors, &errorHandler,
6799 "charmap", "character maps to <undefined>",
6800 &starts, &e, &startinpos, &endinpos, &exc, &s,
6801 &v, &outpos, &p)) {
6802 goto onError;
6803 }
6804 continue;
6805 }
6806 *p++ = x;
6807 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006808 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006809 }
6810 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006811 while (s < e) {
6812 unsigned char ch = *s;
6813 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006814
Benjamin Peterson29060642009-01-31 22:14:21 +00006815 /* Get mapping (char ordinal -> integer, Unicode char or None) */
6816 w = PyLong_FromLong((long)ch);
6817 if (w == NULL)
6818 goto onError;
6819 x = PyObject_GetItem(mapping, w);
6820 Py_DECREF(w);
6821 if (x == NULL) {
6822 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6823 /* No mapping found means: mapping is undefined. */
6824 PyErr_Clear();
6825 x = Py_None;
6826 Py_INCREF(x);
6827 } else
6828 goto onError;
6829 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006830
Benjamin Peterson29060642009-01-31 22:14:21 +00006831 /* Apply mapping */
6832 if (PyLong_Check(x)) {
6833 long value = PyLong_AS_LONG(x);
6834 if (value < 0 || value > 65535) {
6835 PyErr_SetString(PyExc_TypeError,
6836 "character mapping must be in range(65536)");
6837 Py_DECREF(x);
6838 goto onError;
6839 }
6840 *p++ = (Py_UNICODE)value;
6841 }
6842 else if (x == Py_None) {
6843 /* undefined mapping */
6844 outpos = p-PyUnicode_AS_UNICODE(v);
6845 startinpos = s-starts;
6846 endinpos = startinpos+1;
6847 if (unicode_decode_call_errorhandler(
6848 errors, &errorHandler,
6849 "charmap", "character maps to <undefined>",
6850 &starts, &e, &startinpos, &endinpos, &exc, &s,
6851 &v, &outpos, &p)) {
6852 Py_DECREF(x);
6853 goto onError;
6854 }
6855 Py_DECREF(x);
6856 continue;
6857 }
6858 else if (PyUnicode_Check(x)) {
6859 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006860
Benjamin Peterson29060642009-01-31 22:14:21 +00006861 if (targetsize == 1)
6862 /* 1-1 mapping */
6863 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006864
Benjamin Peterson29060642009-01-31 22:14:21 +00006865 else if (targetsize > 1) {
6866 /* 1-n mapping */
6867 if (targetsize > extrachars) {
6868 /* resize first */
6869 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
6870 Py_ssize_t needed = (targetsize - extrachars) + \
6871 (targetsize << 2);
6872 extrachars += needed;
6873 /* XXX overflow detection missing */
Victor Stinnerfe226c02011-10-03 03:52:20 +02006874 if (PyUnicode_Resize((PyObject**)&v,
Benjamin Peterson29060642009-01-31 22:14:21 +00006875 PyUnicode_GET_SIZE(v) + needed) < 0) {
6876 Py_DECREF(x);
6877 goto onError;
6878 }
6879 p = PyUnicode_AS_UNICODE(v) + oldpos;
6880 }
6881 Py_UNICODE_COPY(p,
6882 PyUnicode_AS_UNICODE(x),
6883 targetsize);
6884 p += targetsize;
6885 extrachars -= targetsize;
6886 }
6887 /* 1-0 mapping: skip the character */
6888 }
6889 else {
6890 /* wrong return value */
6891 PyErr_SetString(PyExc_TypeError,
6892 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006893 Py_DECREF(x);
6894 goto onError;
6895 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006896 Py_DECREF(x);
6897 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006898 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006899 }
6900 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinnerfe226c02011-10-03 03:52:20 +02006901 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006902 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006903 Py_XDECREF(errorHandler);
6904 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006905 if (PyUnicode_READY(v) == -1) {
6906 Py_DECREF(v);
6907 return NULL;
6908 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006909 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006910
Benjamin Peterson29060642009-01-31 22:14:21 +00006911 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006912 Py_XDECREF(errorHandler);
6913 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006914 Py_XDECREF(v);
6915 return NULL;
6916}
6917
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006918/* Charmap encoding: the lookup table */
6919
Alexander Belopolsky40018472011-02-26 01:02:56 +00006920struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00006921 PyObject_HEAD
6922 unsigned char level1[32];
6923 int count2, count3;
6924 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006925};
6926
6927static PyObject*
6928encoding_map_size(PyObject *obj, PyObject* args)
6929{
6930 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006931 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00006932 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006933}
6934
6935static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006936 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00006937 PyDoc_STR("Return the size (in bytes) of this object") },
6938 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006939};
6940
6941static void
6942encoding_map_dealloc(PyObject* o)
6943{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006944 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006945}
6946
6947static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006948 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006949 "EncodingMap", /*tp_name*/
6950 sizeof(struct encoding_map), /*tp_basicsize*/
6951 0, /*tp_itemsize*/
6952 /* methods */
6953 encoding_map_dealloc, /*tp_dealloc*/
6954 0, /*tp_print*/
6955 0, /*tp_getattr*/
6956 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00006957 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00006958 0, /*tp_repr*/
6959 0, /*tp_as_number*/
6960 0, /*tp_as_sequence*/
6961 0, /*tp_as_mapping*/
6962 0, /*tp_hash*/
6963 0, /*tp_call*/
6964 0, /*tp_str*/
6965 0, /*tp_getattro*/
6966 0, /*tp_setattro*/
6967 0, /*tp_as_buffer*/
6968 Py_TPFLAGS_DEFAULT, /*tp_flags*/
6969 0, /*tp_doc*/
6970 0, /*tp_traverse*/
6971 0, /*tp_clear*/
6972 0, /*tp_richcompare*/
6973 0, /*tp_weaklistoffset*/
6974 0, /*tp_iter*/
6975 0, /*tp_iternext*/
6976 encoding_map_methods, /*tp_methods*/
6977 0, /*tp_members*/
6978 0, /*tp_getset*/
6979 0, /*tp_base*/
6980 0, /*tp_dict*/
6981 0, /*tp_descr_get*/
6982 0, /*tp_descr_set*/
6983 0, /*tp_dictoffset*/
6984 0, /*tp_init*/
6985 0, /*tp_alloc*/
6986 0, /*tp_new*/
6987 0, /*tp_free*/
6988 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006989};
6990
6991PyObject*
6992PyUnicode_BuildEncodingMap(PyObject* string)
6993{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006994 PyObject *result;
6995 struct encoding_map *mresult;
6996 int i;
6997 int need_dict = 0;
6998 unsigned char level1[32];
6999 unsigned char level2[512];
7000 unsigned char *mlevel1, *mlevel2, *mlevel3;
7001 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007002 int kind;
7003 void *data;
7004 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007005
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007006 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007007 PyErr_BadArgument();
7008 return NULL;
7009 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007010 kind = PyUnicode_KIND(string);
7011 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007012 memset(level1, 0xFF, sizeof level1);
7013 memset(level2, 0xFF, sizeof level2);
7014
7015 /* If there isn't a one-to-one mapping of NULL to \0,
7016 or if there are non-BMP characters, we need to use
7017 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007018 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007019 need_dict = 1;
7020 for (i = 1; i < 256; i++) {
7021 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007022 ch = PyUnicode_READ(kind, data, i);
7023 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007024 need_dict = 1;
7025 break;
7026 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007027 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007028 /* unmapped character */
7029 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007030 l1 = ch >> 11;
7031 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007032 if (level1[l1] == 0xFF)
7033 level1[l1] = count2++;
7034 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007035 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007036 }
7037
7038 if (count2 >= 0xFF || count3 >= 0xFF)
7039 need_dict = 1;
7040
7041 if (need_dict) {
7042 PyObject *result = PyDict_New();
7043 PyObject *key, *value;
7044 if (!result)
7045 return NULL;
7046 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007047 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007048 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007049 if (!key || !value)
7050 goto failed1;
7051 if (PyDict_SetItem(result, key, value) == -1)
7052 goto failed1;
7053 Py_DECREF(key);
7054 Py_DECREF(value);
7055 }
7056 return result;
7057 failed1:
7058 Py_XDECREF(key);
7059 Py_XDECREF(value);
7060 Py_DECREF(result);
7061 return NULL;
7062 }
7063
7064 /* Create a three-level trie */
7065 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7066 16*count2 + 128*count3 - 1);
7067 if (!result)
7068 return PyErr_NoMemory();
7069 PyObject_Init(result, &EncodingMapType);
7070 mresult = (struct encoding_map*)result;
7071 mresult->count2 = count2;
7072 mresult->count3 = count3;
7073 mlevel1 = mresult->level1;
7074 mlevel2 = mresult->level23;
7075 mlevel3 = mresult->level23 + 16*count2;
7076 memcpy(mlevel1, level1, 32);
7077 memset(mlevel2, 0xFF, 16*count2);
7078 memset(mlevel3, 0, 128*count3);
7079 count3 = 0;
7080 for (i = 1; i < 256; i++) {
7081 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007082 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007083 /* unmapped character */
7084 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007085 o1 = PyUnicode_READ(kind, data, i)>>11;
7086 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007087 i2 = 16*mlevel1[o1] + o2;
7088 if (mlevel2[i2] == 0xFF)
7089 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007090 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007091 i3 = 128*mlevel2[i2] + o3;
7092 mlevel3[i3] = i;
7093 }
7094 return result;
7095}
7096
7097static int
7098encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
7099{
7100 struct encoding_map *map = (struct encoding_map*)mapping;
7101 int l1 = c>>11;
7102 int l2 = (c>>7) & 0xF;
7103 int l3 = c & 0x7F;
7104 int i;
7105
7106#ifdef Py_UNICODE_WIDE
7107 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007108 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007109 }
7110#endif
7111 if (c == 0)
7112 return 0;
7113 /* level 1*/
7114 i = map->level1[l1];
7115 if (i == 0xFF) {
7116 return -1;
7117 }
7118 /* level 2*/
7119 i = map->level23[16*i+l2];
7120 if (i == 0xFF) {
7121 return -1;
7122 }
7123 /* level 3 */
7124 i = map->level23[16*map->count2 + 128*i + l3];
7125 if (i == 0) {
7126 return -1;
7127 }
7128 return i;
7129}
7130
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007131/* Lookup the character ch in the mapping. If the character
7132 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007133 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007134static PyObject *
7135charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007136{
Christian Heimes217cfd12007-12-02 14:31:20 +00007137 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007138 PyObject *x;
7139
7140 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007141 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007142 x = PyObject_GetItem(mapping, w);
7143 Py_DECREF(w);
7144 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007145 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7146 /* No mapping found means: mapping is undefined. */
7147 PyErr_Clear();
7148 x = Py_None;
7149 Py_INCREF(x);
7150 return x;
7151 } else
7152 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007153 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007154 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007155 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007156 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007157 long value = PyLong_AS_LONG(x);
7158 if (value < 0 || value > 255) {
7159 PyErr_SetString(PyExc_TypeError,
7160 "character mapping must be in range(256)");
7161 Py_DECREF(x);
7162 return NULL;
7163 }
7164 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007165 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007166 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007167 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007168 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007169 /* wrong return value */
7170 PyErr_Format(PyExc_TypeError,
7171 "character mapping must return integer, bytes or None, not %.400s",
7172 x->ob_type->tp_name);
7173 Py_DECREF(x);
7174 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007175 }
7176}
7177
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007178static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007179charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007180{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007181 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7182 /* exponentially overallocate to minimize reallocations */
7183 if (requiredsize < 2*outsize)
7184 requiredsize = 2*outsize;
7185 if (_PyBytes_Resize(outobj, requiredsize))
7186 return -1;
7187 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007188}
7189
/* Status codes returned by charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* a lookup or resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007193/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007194 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007195 space is available. Return a new reference to the object that
7196 was put in the output buffer, or Py_None, if the mapping was undefined
7197 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007198 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007199static charmapencode_result
7200charmapencode_output(Py_UNICODE c, PyObject *mapping,
7201 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007202{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007203 PyObject *rep;
7204 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007205 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007206
Christian Heimes90aa7642007-12-19 02:45:37 +00007207 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007208 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007209 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007210 if (res == -1)
7211 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007212 if (outsize<requiredsize)
7213 if (charmapencode_resize(outobj, outpos, requiredsize))
7214 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007215 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007216 outstart[(*outpos)++] = (char)res;
7217 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007218 }
7219
7220 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007221 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007222 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007223 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007224 Py_DECREF(rep);
7225 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007226 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007227 if (PyLong_Check(rep)) {
7228 Py_ssize_t requiredsize = *outpos+1;
7229 if (outsize<requiredsize)
7230 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7231 Py_DECREF(rep);
7232 return enc_EXCEPTION;
7233 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007234 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007235 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007236 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007237 else {
7238 const char *repchars = PyBytes_AS_STRING(rep);
7239 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7240 Py_ssize_t requiredsize = *outpos+repsize;
7241 if (outsize<requiredsize)
7242 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7243 Py_DECREF(rep);
7244 return enc_EXCEPTION;
7245 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007246 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007247 memcpy(outstart + *outpos, repchars, repsize);
7248 *outpos += repsize;
7249 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007250 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007251 Py_DECREF(rep);
7252 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007253}
7254
7255/* handle an error in PyUnicode_EncodeCharmap
7256 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007257static int
7258charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00007259 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007260 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007261 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007262 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007263{
7264 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007265 Py_ssize_t repsize;
7266 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007267 Py_UNICODE *uni2;
7268 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007269 Py_ssize_t collstartpos = *inpos;
7270 Py_ssize_t collendpos = *inpos+1;
7271 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007272 char *encoding = "charmap";
7273 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007274 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007275
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007276 /* find all unencodable characters */
7277 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007278 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007279 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007280 int res = encoding_map_lookup(p[collendpos], mapping);
7281 if (res != -1)
7282 break;
7283 ++collendpos;
7284 continue;
7285 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007286
Benjamin Peterson29060642009-01-31 22:14:21 +00007287 rep = charmapencode_lookup(p[collendpos], mapping);
7288 if (rep==NULL)
7289 return -1;
7290 else if (rep!=Py_None) {
7291 Py_DECREF(rep);
7292 break;
7293 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007294 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007295 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007296 }
7297 /* cache callback name lookup
7298 * (if not done yet, i.e. it's the first error) */
7299 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007300 if ((errors==NULL) || (!strcmp(errors, "strict")))
7301 *known_errorHandler = 1;
7302 else if (!strcmp(errors, "replace"))
7303 *known_errorHandler = 2;
7304 else if (!strcmp(errors, "ignore"))
7305 *known_errorHandler = 3;
7306 else if (!strcmp(errors, "xmlcharrefreplace"))
7307 *known_errorHandler = 4;
7308 else
7309 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007310 }
7311 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007312 case 1: /* strict */
7313 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7314 return -1;
7315 case 2: /* replace */
7316 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007317 x = charmapencode_output('?', mapping, res, respos);
7318 if (x==enc_EXCEPTION) {
7319 return -1;
7320 }
7321 else if (x==enc_FAILED) {
7322 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7323 return -1;
7324 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007325 }
7326 /* fall through */
7327 case 3: /* ignore */
7328 *inpos = collendpos;
7329 break;
7330 case 4: /* xmlcharrefreplace */
7331 /* generate replacement (temporarily (mis)uses p) */
7332 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007333 char buffer[2+29+1+1];
7334 char *cp;
7335 sprintf(buffer, "&#%d;", (int)p[collpos]);
7336 for (cp = buffer; *cp; ++cp) {
7337 x = charmapencode_output(*cp, mapping, res, respos);
7338 if (x==enc_EXCEPTION)
7339 return -1;
7340 else if (x==enc_FAILED) {
7341 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7342 return -1;
7343 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007344 }
7345 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007346 *inpos = collendpos;
7347 break;
7348 default:
7349 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00007350 encoding, reason, p, size, exceptionObject,
7351 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007352 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007353 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007354 if (PyBytes_Check(repunicode)) {
7355 /* Directly copy bytes result to output. */
7356 Py_ssize_t outsize = PyBytes_Size(*res);
7357 Py_ssize_t requiredsize;
7358 repsize = PyBytes_Size(repunicode);
7359 requiredsize = *respos + repsize;
7360 if (requiredsize > outsize)
7361 /* Make room for all additional bytes. */
7362 if (charmapencode_resize(res, respos, requiredsize)) {
7363 Py_DECREF(repunicode);
7364 return -1;
7365 }
7366 memcpy(PyBytes_AsString(*res) + *respos,
7367 PyBytes_AsString(repunicode), repsize);
7368 *respos += repsize;
7369 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007370 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007371 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007372 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007373 /* generate replacement */
7374 repsize = PyUnicode_GET_SIZE(repunicode);
7375 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007376 x = charmapencode_output(*uni2, mapping, res, respos);
7377 if (x==enc_EXCEPTION) {
7378 return -1;
7379 }
7380 else if (x==enc_FAILED) {
7381 Py_DECREF(repunicode);
7382 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7383 return -1;
7384 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007385 }
7386 *inpos = newpos;
7387 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007388 }
7389 return 0;
7390}
7391
Alexander Belopolsky40018472011-02-26 01:02:56 +00007392PyObject *
7393PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7394 Py_ssize_t size,
7395 PyObject *mapping,
7396 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007397{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007398 /* output object */
7399 PyObject *res = NULL;
7400 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007401 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007402 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007403 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007404 PyObject *errorHandler = NULL;
7405 PyObject *exc = NULL;
7406 /* the following variable is used for caching string comparisons
7407 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7408 * 3=ignore, 4=xmlcharrefreplace */
7409 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007410
7411 /* Default to Latin-1 */
7412 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007413 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007414
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007415 /* allocate enough for a simple encoding without
7416 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007417 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007418 if (res == NULL)
7419 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007420 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007421 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007422
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007423 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007424 /* try to encode it */
7425 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7426 if (x==enc_EXCEPTION) /* error */
7427 goto onError;
7428 if (x==enc_FAILED) { /* unencodable character */
7429 if (charmap_encoding_error(p, size, &inpos, mapping,
7430 &exc,
7431 &known_errorHandler, &errorHandler, errors,
7432 &res, &respos)) {
7433 goto onError;
7434 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007435 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007436 else
7437 /* done with this character => adjust input position */
7438 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007439 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007440
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007441 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007442 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007443 if (_PyBytes_Resize(&res, respos) < 0)
7444 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00007445
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007446 Py_XDECREF(exc);
7447 Py_XDECREF(errorHandler);
7448 return res;
7449
Benjamin Peterson29060642009-01-31 22:14:21 +00007450 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007451 Py_XDECREF(res);
7452 Py_XDECREF(exc);
7453 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007454 return NULL;
7455}
7456
Alexander Belopolsky40018472011-02-26 01:02:56 +00007457PyObject *
7458PyUnicode_AsCharmapString(PyObject *unicode,
7459 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007460{
7461 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007462 PyErr_BadArgument();
7463 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007464 }
7465 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007466 PyUnicode_GET_SIZE(unicode),
7467 mapping,
7468 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007469}
7470
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007471/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007472static void
7473make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007474 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007475 Py_ssize_t startpos, Py_ssize_t endpos,
7476 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007477{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007478 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007479 *exceptionObject = _PyUnicodeTranslateError_Create(
7480 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007481 }
7482 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007483 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7484 goto onError;
7485 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7486 goto onError;
7487 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7488 goto onError;
7489 return;
7490 onError:
7491 Py_DECREF(*exceptionObject);
7492 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007493 }
7494}
7495
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007496/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007497static void
7498raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007499 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007500 Py_ssize_t startpos, Py_ssize_t endpos,
7501 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007502{
7503 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007504 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007505 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007506 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007507}
7508
7509/* error handling callback helper:
7510 build arguments, call the callback and check the arguments,
7511 put the result into newpos and return the replacement string, which
7512 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007513static PyObject *
7514unicode_translate_call_errorhandler(const char *errors,
7515 PyObject **errorHandler,
7516 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007517 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007518 Py_ssize_t startpos, Py_ssize_t endpos,
7519 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007520{
Benjamin Peterson142957c2008-07-04 19:55:29 +00007521 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007522
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007523 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007524 PyObject *restuple;
7525 PyObject *resunicode;
7526
7527 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007528 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007529 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007530 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007531 }
7532
7533 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007534 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007535 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007536 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007537
7538 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00007539 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007540 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007541 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007542 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00007543 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007544 Py_DECREF(restuple);
7545 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007546 }
7547 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00007548 &resunicode, &i_newpos)) {
7549 Py_DECREF(restuple);
7550 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007551 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007552 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007553 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007554 else
7555 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007556 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007557 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7558 Py_DECREF(restuple);
7559 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007560 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007561 Py_INCREF(resunicode);
7562 Py_DECREF(restuple);
7563 return resunicode;
7564}
7565
7566/* Lookup the character ch in the mapping and put the result in result,
7567 which must be decrefed by the caller.
7568 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007569static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007570charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007571{
Christian Heimes217cfd12007-12-02 14:31:20 +00007572 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007573 PyObject *x;
7574
7575 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007576 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007577 x = PyObject_GetItem(mapping, w);
7578 Py_DECREF(w);
7579 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007580 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7581 /* No mapping found means: use 1:1 mapping. */
7582 PyErr_Clear();
7583 *result = NULL;
7584 return 0;
7585 } else
7586 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007587 }
7588 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007589 *result = x;
7590 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007591 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007592 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007593 long value = PyLong_AS_LONG(x);
7594 long max = PyUnicode_GetMax();
7595 if (value < 0 || value > max) {
7596 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00007597 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007598 Py_DECREF(x);
7599 return -1;
7600 }
7601 *result = x;
7602 return 0;
7603 }
7604 else if (PyUnicode_Check(x)) {
7605 *result = x;
7606 return 0;
7607 }
7608 else {
7609 /* wrong return value */
7610 PyErr_SetString(PyExc_TypeError,
7611 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007612 Py_DECREF(x);
7613 return -1;
7614 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007615}
7616/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00007617 if not reallocate and adjust various state variables.
7618 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007619static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007620charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00007621 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007622{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007623 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00007624 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007625 /* exponentially overallocate to minimize reallocations */
7626 if (requiredsize < 2 * oldsize)
7627 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007628 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
7629 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007630 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007631 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007632 }
7633 return 0;
7634}
7635/* lookup the character, put the result in the output string and adjust
7636 various state variables. Return a new reference to the object that
7637 was put in the output buffer in *result, or Py_None, if the mapping was
7638 undefined (in which case no character was written).
7639 The called must decref result.
7640 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007641static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007642charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
7643 PyObject *mapping, Py_UCS4 **output,
7644 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007645 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007646{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007647 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
7648 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00007649 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007650 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007651 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007652 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007653 }
7654 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007655 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00007656 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007657 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007658 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007659 }
7660 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007661 Py_ssize_t repsize;
7662 if (PyUnicode_READY(*res) == -1)
7663 return -1;
7664 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00007665 if (repsize==1) {
7666 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007667 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00007668 }
7669 else if (repsize!=0) {
7670 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007671 Py_ssize_t requiredsize = *opos +
7672 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00007673 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007674 Py_ssize_t i;
7675 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007676 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007677 for(i = 0; i < repsize; i++)
7678 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00007679 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007680 }
7681 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007682 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007683 return 0;
7684}
7685
Alexander Belopolsky40018472011-02-26 01:02:56 +00007686PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007687_PyUnicode_TranslateCharmap(PyObject *input,
7688 PyObject *mapping,
7689 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007690{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007691 /* input object */
7692 char *idata;
7693 Py_ssize_t size, i;
7694 int kind;
7695 /* output buffer */
7696 Py_UCS4 *output = NULL;
7697 Py_ssize_t osize;
7698 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007699 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007700 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007701 char *reason = "character maps to <undefined>";
7702 PyObject *errorHandler = NULL;
7703 PyObject *exc = NULL;
7704 /* the following variable is used for caching string comparisons
7705 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7706 * 3=ignore, 4=xmlcharrefreplace */
7707 int known_errorHandler = -1;
7708
Guido van Rossumd57fd912000-03-10 22:53:23 +00007709 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007710 PyErr_BadArgument();
7711 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007712 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007713
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007714 if (PyUnicode_READY(input) == -1)
7715 return NULL;
7716 idata = (char*)PyUnicode_DATA(input);
7717 kind = PyUnicode_KIND(input);
7718 size = PyUnicode_GET_LENGTH(input);
7719 i = 0;
7720
7721 if (size == 0) {
7722 Py_INCREF(input);
7723 return input;
7724 }
7725
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007726 /* allocate enough for a simple 1:1 translation without
7727 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007728 osize = size;
7729 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
7730 opos = 0;
7731 if (output == NULL) {
7732 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00007733 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007734 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007735
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007736 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007737 /* try to encode it */
7738 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007739 if (charmaptranslate_output(input, i, mapping,
7740 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007741 Py_XDECREF(x);
7742 goto onError;
7743 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007744 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00007745 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007746 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007747 else { /* untranslatable character */
7748 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7749 Py_ssize_t repsize;
7750 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007751 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00007752 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007753 Py_ssize_t collstart = i;
7754 Py_ssize_t collend = i+1;
7755 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007756
Benjamin Peterson29060642009-01-31 22:14:21 +00007757 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007758 while (collend < size) {
7759 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007760 goto onError;
7761 Py_XDECREF(x);
7762 if (x!=Py_None)
7763 break;
7764 ++collend;
7765 }
7766 /* cache callback name lookup
7767 * (if not done yet, i.e. it's the first error) */
7768 if (known_errorHandler==-1) {
7769 if ((errors==NULL) || (!strcmp(errors, "strict")))
7770 known_errorHandler = 1;
7771 else if (!strcmp(errors, "replace"))
7772 known_errorHandler = 2;
7773 else if (!strcmp(errors, "ignore"))
7774 known_errorHandler = 3;
7775 else if (!strcmp(errors, "xmlcharrefreplace"))
7776 known_errorHandler = 4;
7777 else
7778 known_errorHandler = 0;
7779 }
7780 switch (known_errorHandler) {
7781 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007782 raise_translate_exception(&exc, input, collstart,
7783 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007784 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007785 case 2: /* replace */
7786 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007787 for (coll = collstart; coll<collend; coll++)
7788 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00007789 /* fall through */
7790 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007791 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007792 break;
7793 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007794 /* generate replacement (temporarily (mis)uses i) */
7795 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007796 char buffer[2+29+1+1];
7797 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007798 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
7799 if (charmaptranslate_makespace(&output, &osize,
7800 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00007801 goto onError;
7802 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007803 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00007804 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007805 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007806 break;
7807 default:
7808 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007809 reason, input, &exc,
7810 collstart, collend, &newpos);
7811 if (repunicode == NULL || PyUnicode_READY(repunicode) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007812 goto onError;
7813 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007814 repsize = PyUnicode_GET_LENGTH(repunicode);
7815 if (charmaptranslate_makespace(&output, &osize,
7816 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007817 Py_DECREF(repunicode);
7818 goto onError;
7819 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007820 for (uni2 = 0; repsize-->0; ++uni2)
7821 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
7822 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00007823 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007824 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007825 }
7826 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007827 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
7828 if (!res)
7829 goto onError;
7830 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007831 Py_XDECREF(exc);
7832 Py_XDECREF(errorHandler);
7833 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007834
Benjamin Peterson29060642009-01-31 22:14:21 +00007835 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007836 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007837 Py_XDECREF(exc);
7838 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007839 return NULL;
7840}
7841
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007842/* Deprecated. Use PyUnicode_Translate instead. */
7843PyObject *
7844PyUnicode_TranslateCharmap(const Py_UNICODE *p,
7845 Py_ssize_t size,
7846 PyObject *mapping,
7847 const char *errors)
7848{
7849 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7850 if (!unicode)
7851 return NULL;
7852 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
7853}
7854
Alexander Belopolsky40018472011-02-26 01:02:56 +00007855PyObject *
7856PyUnicode_Translate(PyObject *str,
7857 PyObject *mapping,
7858 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007859{
7860 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007861
Guido van Rossumd57fd912000-03-10 22:53:23 +00007862 str = PyUnicode_FromObject(str);
7863 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007864 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007865 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007866 Py_DECREF(str);
7867 return result;
Tim Petersced69f82003-09-16 20:30:58 +00007868
Benjamin Peterson29060642009-01-31 22:14:21 +00007869 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007870 Py_XDECREF(str);
7871 return NULL;
7872}
Tim Petersced69f82003-09-16 20:30:58 +00007873
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007874static Py_UCS4
7875fix_decimal_and_space_to_ascii(PyUnicodeObject *self)
7876{
7877 /* No need to call PyUnicode_READY(self) because this function is only
7878 called as a callback from fixup() which does it already. */
7879 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
7880 const int kind = PyUnicode_KIND(self);
7881 void *data = PyUnicode_DATA(self);
7882 Py_UCS4 maxchar = 0, ch, fixed;
7883 Py_ssize_t i;
7884
7885 for (i = 0; i < len; ++i) {
7886 ch = PyUnicode_READ(kind, data, i);
7887 fixed = 0;
7888 if (ch > 127) {
7889 if (Py_UNICODE_ISSPACE(ch))
7890 fixed = ' ';
7891 else {
7892 const int decimal = Py_UNICODE_TODECIMAL(ch);
7893 if (decimal >= 0)
7894 fixed = '0' + decimal;
7895 }
7896 if (fixed != 0) {
7897 if (fixed > maxchar)
7898 maxchar = fixed;
7899 PyUnicode_WRITE(kind, data, i, fixed);
7900 }
7901 else if (ch > maxchar)
7902 maxchar = ch;
7903 }
7904 else if (ch > maxchar)
7905 maxchar = ch;
7906 }
7907
7908 return maxchar;
7909}
7910
7911PyObject *
7912_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
7913{
7914 if (!PyUnicode_Check(unicode)) {
7915 PyErr_BadInternalCall();
7916 return NULL;
7917 }
7918 if (PyUnicode_READY(unicode) == -1)
7919 return NULL;
7920 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
7921 /* If the string is already ASCII, just return the same string */
7922 Py_INCREF(unicode);
7923 return unicode;
7924 }
7925 return fixup((PyUnicodeObject *)unicode, fix_decimal_and_space_to_ascii);
7926}
7927
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007928PyObject *
7929PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
7930 Py_ssize_t length)
7931{
7932 PyObject *result;
7933 Py_UNICODE *p; /* write pointer into result */
7934 Py_ssize_t i;
7935 /* Copy to a new string */
7936 result = (PyObject *)_PyUnicode_New(length);
7937 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
7938 if (result == NULL)
7939 return result;
7940 p = PyUnicode_AS_UNICODE(result);
7941 /* Iterate over code points */
7942 for (i = 0; i < length; i++) {
7943 Py_UNICODE ch =s[i];
7944 if (ch > 127) {
7945 int decimal = Py_UNICODE_TODECIMAL(ch);
7946 if (decimal >= 0)
7947 p[i] = '0' + decimal;
7948 }
7949 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007950 if (PyUnicode_READY((PyUnicodeObject*)result) == -1) {
7951 Py_DECREF(result);
7952 return NULL;
7953 }
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007954 return result;
7955}
Guido van Rossum9e896b32000-04-05 20:11:21 +00007956/* --- Decimal Encoder ---------------------------------------------------- */
7957
Alexander Belopolsky40018472011-02-26 01:02:56 +00007958int
7959PyUnicode_EncodeDecimal(Py_UNICODE *s,
7960 Py_ssize_t length,
7961 char *output,
7962 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00007963{
7964 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007965 PyObject *errorHandler = NULL;
7966 PyObject *exc = NULL;
7967 const char *encoding = "decimal";
7968 const char *reason = "invalid decimal Unicode string";
7969 /* the following variable is used for caching string comparisons
7970 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
7971 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00007972
7973 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007974 PyErr_BadArgument();
7975 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00007976 }
7977
7978 p = s;
7979 end = s + length;
7980 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007981 register Py_UNICODE ch = *p;
7982 int decimal;
7983 PyObject *repunicode;
7984 Py_ssize_t repsize;
7985 Py_ssize_t newpos;
7986 Py_UNICODE *uni2;
7987 Py_UNICODE *collstart;
7988 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00007989
Benjamin Peterson29060642009-01-31 22:14:21 +00007990 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007991 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00007992 ++p;
7993 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007994 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007995 decimal = Py_UNICODE_TODECIMAL(ch);
7996 if (decimal >= 0) {
7997 *output++ = '0' + decimal;
7998 ++p;
7999 continue;
8000 }
8001 if (0 < ch && ch < 256) {
8002 *output++ = (char)ch;
8003 ++p;
8004 continue;
8005 }
8006 /* All other characters are considered unencodable */
8007 collstart = p;
8008 collend = p+1;
8009 while (collend < end) {
8010 if ((0 < *collend && *collend < 256) ||
8011 !Py_UNICODE_ISSPACE(*collend) ||
8012 Py_UNICODE_TODECIMAL(*collend))
8013 break;
8014 }
8015 /* cache callback name lookup
8016 * (if not done yet, i.e. it's the first error) */
8017 if (known_errorHandler==-1) {
8018 if ((errors==NULL) || (!strcmp(errors, "strict")))
8019 known_errorHandler = 1;
8020 else if (!strcmp(errors, "replace"))
8021 known_errorHandler = 2;
8022 else if (!strcmp(errors, "ignore"))
8023 known_errorHandler = 3;
8024 else if (!strcmp(errors, "xmlcharrefreplace"))
8025 known_errorHandler = 4;
8026 else
8027 known_errorHandler = 0;
8028 }
8029 switch (known_errorHandler) {
8030 case 1: /* strict */
8031 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
8032 goto onError;
8033 case 2: /* replace */
8034 for (p = collstart; p < collend; ++p)
8035 *output++ = '?';
8036 /* fall through */
8037 case 3: /* ignore */
8038 p = collend;
8039 break;
8040 case 4: /* xmlcharrefreplace */
8041 /* generate replacement (temporarily (mis)uses p) */
8042 for (p = collstart; p < collend; ++p)
8043 output += sprintf(output, "&#%d;", (int)*p);
8044 p = collend;
8045 break;
8046 default:
8047 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
8048 encoding, reason, s, length, &exc,
8049 collstart-s, collend-s, &newpos);
8050 if (repunicode == NULL)
8051 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008052 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00008053 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008054 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8055 Py_DECREF(repunicode);
8056 goto onError;
8057 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008058 /* generate replacement */
8059 repsize = PyUnicode_GET_SIZE(repunicode);
8060 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
8061 Py_UNICODE ch = *uni2;
8062 if (Py_UNICODE_ISSPACE(ch))
8063 *output++ = ' ';
8064 else {
8065 decimal = Py_UNICODE_TODECIMAL(ch);
8066 if (decimal >= 0)
8067 *output++ = '0' + decimal;
8068 else if (0 < ch && ch < 256)
8069 *output++ = (char)ch;
8070 else {
8071 Py_DECREF(repunicode);
8072 raise_encode_exception(&exc, encoding,
8073 s, length, collstart-s, collend-s, reason);
8074 goto onError;
8075 }
8076 }
8077 }
8078 p = s + newpos;
8079 Py_DECREF(repunicode);
8080 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00008081 }
8082 /* 0-terminate the output string */
8083 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008084 Py_XDECREF(exc);
8085 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008086 return 0;
8087
Benjamin Peterson29060642009-01-31 22:14:21 +00008088 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008089 Py_XDECREF(exc);
8090 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008091 return -1;
8092}
8093
Guido van Rossumd57fd912000-03-10 22:53:23 +00008094/* --- Helpers ------------------------------------------------------------ */
8095
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008096#include "stringlib/ucs1lib.h"
8097#include "stringlib/fastsearch.h"
8098#include "stringlib/partition.h"
8099#include "stringlib/split.h"
8100#include "stringlib/count.h"
8101#include "stringlib/find.h"
8102#include "stringlib/localeutil.h"
8103#include "stringlib/undef.h"
8104
8105#include "stringlib/ucs2lib.h"
8106#include "stringlib/fastsearch.h"
8107#include "stringlib/partition.h"
8108#include "stringlib/split.h"
8109#include "stringlib/count.h"
8110#include "stringlib/find.h"
8111#include "stringlib/localeutil.h"
8112#include "stringlib/undef.h"
8113
8114#include "stringlib/ucs4lib.h"
8115#include "stringlib/fastsearch.h"
8116#include "stringlib/partition.h"
8117#include "stringlib/split.h"
8118#include "stringlib/count.h"
8119#include "stringlib/find.h"
8120#include "stringlib/localeutil.h"
8121#include "stringlib/undef.h"
8122
8123static Py_ssize_t
8124any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
8125 const Py_UCS1*, Py_ssize_t,
8126 Py_ssize_t, Py_ssize_t),
8127 Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
8128 const Py_UCS2*, Py_ssize_t,
8129 Py_ssize_t, Py_ssize_t),
8130 Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t,
8131 const Py_UCS4*, Py_ssize_t,
8132 Py_ssize_t, Py_ssize_t),
8133 PyObject* s1, PyObject* s2,
8134 Py_ssize_t start,
8135 Py_ssize_t end)
8136{
8137 int kind1, kind2, kind;
8138 void *buf1, *buf2;
8139 Py_ssize_t len1, len2, result;
8140
8141 kind1 = PyUnicode_KIND(s1);
8142 kind2 = PyUnicode_KIND(s2);
8143 kind = kind1 > kind2 ? kind1 : kind2;
8144 buf1 = PyUnicode_DATA(s1);
8145 buf2 = PyUnicode_DATA(s2);
8146 if (kind1 != kind)
8147 buf1 = _PyUnicode_AsKind(s1, kind);
8148 if (!buf1)
8149 return -2;
8150 if (kind2 != kind)
8151 buf2 = _PyUnicode_AsKind(s2, kind);
8152 if (!buf2) {
8153 if (kind1 != kind) PyMem_Free(buf1);
8154 return -2;
8155 }
8156 len1 = PyUnicode_GET_LENGTH(s1);
8157 len2 = PyUnicode_GET_LENGTH(s2);
8158
8159 switch(kind) {
8160 case PyUnicode_1BYTE_KIND:
8161 result = ucs1(buf1, len1, buf2, len2, start, end);
8162 break;
8163 case PyUnicode_2BYTE_KIND:
8164 result = ucs2(buf1, len1, buf2, len2, start, end);
8165 break;
8166 case PyUnicode_4BYTE_KIND:
8167 result = ucs4(buf1, len1, buf2, len2, start, end);
8168 break;
8169 default:
8170 assert(0); result = -2;
8171 }
8172
8173 if (kind1 != kind)
8174 PyMem_Free(buf1);
8175 if (kind2 != kind)
8176 PyMem_Free(buf2);
8177
8178 return result;
8179}
8180
8181Py_ssize_t
8182_PyUnicode_InsertThousandsGrouping(int kind, void *data,
8183 Py_ssize_t n_buffer,
8184 void *digits, Py_ssize_t n_digits,
8185 Py_ssize_t min_width,
8186 const char *grouping,
8187 const char *thousands_sep)
8188{
8189 switch(kind) {
8190 case PyUnicode_1BYTE_KIND:
8191 return _PyUnicode_ucs1_InsertThousandsGrouping(
8192 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8193 min_width, grouping, thousands_sep);
8194 case PyUnicode_2BYTE_KIND:
8195 return _PyUnicode_ucs2_InsertThousandsGrouping(
8196 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
8197 min_width, grouping, thousands_sep);
8198 case PyUnicode_4BYTE_KIND:
8199 return _PyUnicode_ucs4_InsertThousandsGrouping(
8200 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
8201 min_width, grouping, thousands_sep);
8202 }
8203 assert(0);
8204 return -1;
8205}
8206
8207
Eric Smith8c663262007-08-25 02:26:07 +00008208#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00008209#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008210
Thomas Wouters477c8d52006-05-27 19:21:47 +00008211#include "stringlib/count.h"
8212#include "stringlib/find.h"
Eric Smith5807c412008-05-11 21:00:57 +00008213
/* Normalize a start/end slice pair against a sequence of length `len`:
   a negative index is taken relative to the end and then clamped at 0,
   and an `end` beyond `len` is clamped to `len`.  `start` is NOT clamped
   to `len`.  The arguments are modified in place and may be evaluated
   several times; `len` is expected to be non-negative. */
#define ADJUST_INDICES(start, end, len)         \
    if (end < 0) {                              \
        end += len;                             \
        if (end < 0)                            \
            end = 0;                            \
    }                                           \
    else if (end > len)                         \
        end = len;                              \
    if (start < 0) {                            \
        start += len;                           \
        if (start < 0)                          \
            start = 0;                          \
    }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008228
Alexander Belopolsky40018472011-02-26 01:02:56 +00008229Py_ssize_t
8230PyUnicode_Count(PyObject *str,
8231 PyObject *substr,
8232 Py_ssize_t start,
8233 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008234{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008235 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008236 PyUnicodeObject* str_obj;
8237 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008238 int kind1, kind2, kind;
8239 void *buf1 = NULL, *buf2 = NULL;
8240 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008241
Thomas Wouters477c8d52006-05-27 19:21:47 +00008242 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008243 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008244 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008245 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02008246 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008247 Py_DECREF(str_obj);
8248 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008249 }
Tim Petersced69f82003-09-16 20:30:58 +00008250
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008251 kind1 = PyUnicode_KIND(str_obj);
8252 kind2 = PyUnicode_KIND(sub_obj);
8253 kind = kind1 > kind2 ? kind1 : kind2;
8254 buf1 = PyUnicode_DATA(str_obj);
8255 if (kind1 != kind)
8256 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
8257 if (!buf1)
8258 goto onError;
8259 buf2 = PyUnicode_DATA(sub_obj);
8260 if (kind2 != kind)
8261 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
8262 if (!buf2)
8263 goto onError;
8264 len1 = PyUnicode_GET_LENGTH(str_obj);
8265 len2 = PyUnicode_GET_LENGTH(sub_obj);
8266
8267 ADJUST_INDICES(start, end, len1);
8268 switch(kind) {
8269 case PyUnicode_1BYTE_KIND:
8270 result = ucs1lib_count(
8271 ((Py_UCS1*)buf1) + start, end - start,
8272 buf2, len2, PY_SSIZE_T_MAX
8273 );
8274 break;
8275 case PyUnicode_2BYTE_KIND:
8276 result = ucs2lib_count(
8277 ((Py_UCS2*)buf1) + start, end - start,
8278 buf2, len2, PY_SSIZE_T_MAX
8279 );
8280 break;
8281 case PyUnicode_4BYTE_KIND:
8282 result = ucs4lib_count(
8283 ((Py_UCS4*)buf1) + start, end - start,
8284 buf2, len2, PY_SSIZE_T_MAX
8285 );
8286 break;
8287 default:
8288 assert(0); result = 0;
8289 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008290
8291 Py_DECREF(sub_obj);
8292 Py_DECREF(str_obj);
8293
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008294 if (kind1 != kind)
8295 PyMem_Free(buf1);
8296 if (kind2 != kind)
8297 PyMem_Free(buf2);
8298
Guido van Rossumd57fd912000-03-10 22:53:23 +00008299 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008300 onError:
8301 Py_DECREF(sub_obj);
8302 Py_DECREF(str_obj);
8303 if (kind1 != kind && buf1)
8304 PyMem_Free(buf1);
8305 if (kind2 != kind && buf2)
8306 PyMem_Free(buf2);
8307 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008308}
8309
Alexander Belopolsky40018472011-02-26 01:02:56 +00008310Py_ssize_t
8311PyUnicode_Find(PyObject *str,
8312 PyObject *sub,
8313 Py_ssize_t start,
8314 Py_ssize_t end,
8315 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008316{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008317 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008318
Guido van Rossumd57fd912000-03-10 22:53:23 +00008319 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008320 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008321 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008322 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008323 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008324 Py_DECREF(str);
8325 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008326 }
Tim Petersced69f82003-09-16 20:30:58 +00008327
Thomas Wouters477c8d52006-05-27 19:21:47 +00008328 if (direction > 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008329 result = any_find_slice(
8330 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
8331 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008332 );
8333 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008334 result = any_find_slice(
8335 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
8336 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008337 );
8338
Guido van Rossumd57fd912000-03-10 22:53:23 +00008339 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008340 Py_DECREF(sub);
8341
Guido van Rossumd57fd912000-03-10 22:53:23 +00008342 return result;
8343}
8344
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008345Py_ssize_t
8346PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8347 Py_ssize_t start, Py_ssize_t end,
8348 int direction)
8349{
8350 char *result;
8351 int kind;
8352 if (PyUnicode_READY(str) == -1)
8353 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02008354 if (start < 0 || end < 0) {
8355 PyErr_SetString(PyExc_IndexError, "string index out of range");
8356 return -2;
8357 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008358 if (end > PyUnicode_GET_LENGTH(str))
8359 end = PyUnicode_GET_LENGTH(str);
8360 kind = PyUnicode_KIND(str);
8361 result = findchar(PyUnicode_1BYTE_DATA(str)
8362 + PyUnicode_KIND_SIZE(kind, start),
8363 kind,
8364 end-start, ch, direction);
8365 if (!result)
8366 return -1;
8367 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8368}
8369
Alexander Belopolsky40018472011-02-26 01:02:56 +00008370static int
8371tailmatch(PyUnicodeObject *self,
8372 PyUnicodeObject *substring,
8373 Py_ssize_t start,
8374 Py_ssize_t end,
8375 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008376{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008377 int kind_self;
8378 int kind_sub;
8379 void *data_self;
8380 void *data_sub;
8381 Py_ssize_t offset;
8382 Py_ssize_t i;
8383 Py_ssize_t end_sub;
8384
8385 if (PyUnicode_READY(self) == -1 ||
8386 PyUnicode_READY(substring) == -1)
8387 return 0;
8388
8389 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008390 return 1;
8391
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008392 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8393 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008394 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008395 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008396
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008397 kind_self = PyUnicode_KIND(self);
8398 data_self = PyUnicode_DATA(self);
8399 kind_sub = PyUnicode_KIND(substring);
8400 data_sub = PyUnicode_DATA(substring);
8401 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8402
8403 if (direction > 0)
8404 offset = end;
8405 else
8406 offset = start;
8407
8408 if (PyUnicode_READ(kind_self, data_self, offset) ==
8409 PyUnicode_READ(kind_sub, data_sub, 0) &&
8410 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8411 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8412 /* If both are of the same kind, memcmp is sufficient */
8413 if (kind_self == kind_sub) {
8414 return ! memcmp((char *)data_self +
8415 (offset * PyUnicode_CHARACTER_SIZE(substring)),
8416 data_sub,
8417 PyUnicode_GET_LENGTH(substring) *
8418 PyUnicode_CHARACTER_SIZE(substring));
8419 }
8420 /* otherwise we have to compare each character by first accesing it */
8421 else {
8422 /* We do not need to compare 0 and len(substring)-1 because
8423 the if statement above ensured already that they are equal
8424 when we end up here. */
8425 // TODO: honor direction and do a forward or backwards search
8426 for (i = 1; i < end_sub; ++i) {
8427 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8428 PyUnicode_READ(kind_sub, data_sub, i))
8429 return 0;
8430 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008431 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008432 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008433 }
8434
8435 return 0;
8436}
8437
Alexander Belopolsky40018472011-02-26 01:02:56 +00008438Py_ssize_t
8439PyUnicode_Tailmatch(PyObject *str,
8440 PyObject *substr,
8441 Py_ssize_t start,
8442 Py_ssize_t end,
8443 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008444{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008445 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008446
Guido van Rossumd57fd912000-03-10 22:53:23 +00008447 str = PyUnicode_FromObject(str);
8448 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008449 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008450 substr = PyUnicode_FromObject(substr);
8451 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008452 Py_DECREF(str);
8453 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008454 }
Tim Petersced69f82003-09-16 20:30:58 +00008455
Guido van Rossumd57fd912000-03-10 22:53:23 +00008456 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00008457 (PyUnicodeObject *)substr,
8458 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008459 Py_DECREF(str);
8460 Py_DECREF(substr);
8461 return result;
8462}
8463
Guido van Rossumd57fd912000-03-10 22:53:23 +00008464/* Apply fixfct filter to the Unicode object self and return a
8465 reference to the modified object */
8466
Alexander Belopolsky40018472011-02-26 01:02:56 +00008467static PyObject *
8468fixup(PyUnicodeObject *self,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008469 Py_UCS4 (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008470{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008471 PyObject *u;
8472 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008473
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008474 if (PyUnicode_READY(self) == -1)
8475 return NULL;
8476 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8477 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8478 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008479 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008480 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008481
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008482 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
8483 PyUnicode_GET_LENGTH(u) * PyUnicode_CHARACTER_SIZE(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008484
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008485 /* fix functions return the new maximum character in a string,
8486 if the kind of the resulting unicode object does not change,
8487 everything is fine. Otherwise we need to change the string kind
8488 and re-run the fix function. */
8489 maxchar_new = fixfct((PyUnicodeObject*)u);
8490 if (maxchar_new == 0)
8491 /* do nothing, keep maxchar_new at 0 which means no changes. */;
8492 else if (maxchar_new <= 127)
8493 maxchar_new = 127;
8494 else if (maxchar_new <= 255)
8495 maxchar_new = 255;
8496 else if (maxchar_new <= 65535)
8497 maxchar_new = 65535;
8498 else
8499 maxchar_new = 1114111; /* 0x10ffff */
8500
8501 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008502 /* fixfct should return TRUE if it modified the buffer. If
8503 FALSE, return a reference to the original buffer instead
8504 (to save space, not time) */
8505 Py_INCREF(self);
8506 Py_DECREF(u);
8507 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008508 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008509 else if (maxchar_new == maxchar_old) {
8510 return u;
8511 }
8512 else {
8513 /* In case the maximum character changed, we need to
8514 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008515 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008516 if (v == NULL) {
8517 Py_DECREF(u);
8518 return NULL;
8519 }
8520 if (maxchar_new > maxchar_old) {
8521 /* If the maxchar increased so that the kind changed, not all
8522 characters are representable anymore and we need to fix the
8523 string again. This only happens in very few cases. */
Victor Stinner157f83f2011-09-28 21:41:31 +02008524 if (PyUnicode_CopyCharacters(v, 0,
8525 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008526 PyUnicode_GET_LENGTH(self)) < 0)
8527 {
8528 Py_DECREF(u);
8529 return NULL;
8530 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008531 maxchar_old = fixfct((PyUnicodeObject*)v);
8532 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8533 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008534 else {
Victor Stinner157f83f2011-09-28 21:41:31 +02008535 if (PyUnicode_CopyCharacters(v, 0,
8536 u, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008537 PyUnicode_GET_LENGTH(self)) < 0)
8538 {
8539 Py_DECREF(u);
8540 return NULL;
8541 }
8542 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008543
8544 Py_DECREF(u);
8545 return v;
8546 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008547}
8548
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008549static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008550fixupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008551{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008552 /* No need to call PyUnicode_READY(self) because this function is only
8553 called as a callback from fixup() which does it already. */
8554 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8555 const int kind = PyUnicode_KIND(self);
8556 void *data = PyUnicode_DATA(self);
8557 int touched = 0;
8558 Py_UCS4 maxchar = 0;
8559 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008560
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008561 for (i = 0; i < len; ++i) {
8562 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8563 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8564 if (up != ch) {
8565 if (up > maxchar)
8566 maxchar = up;
8567 PyUnicode_WRITE(kind, data, i, up);
8568 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008569 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008570 else if (ch > maxchar)
8571 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008572 }
8573
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008574 if (touched)
8575 return maxchar;
8576 else
8577 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008578}
8579
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008580static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008581fixlower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008582{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008583 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8584 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8585 const int kind = PyUnicode_KIND(self);
8586 void *data = PyUnicode_DATA(self);
8587 int touched = 0;
8588 Py_UCS4 maxchar = 0;
8589 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008590
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008591 for(i = 0; i < len; ++i) {
8592 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8593 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8594 if (lo != ch) {
8595 if (lo > maxchar)
8596 maxchar = lo;
8597 PyUnicode_WRITE(kind, data, i, lo);
8598 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008599 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008600 else if (ch > maxchar)
8601 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008602 }
8603
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008604 if (touched)
8605 return maxchar;
8606 else
8607 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008608}
8609
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008610static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008611fixswapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008612{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008613 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8614 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8615 const int kind = PyUnicode_KIND(self);
8616 void *data = PyUnicode_DATA(self);
8617 int touched = 0;
8618 Py_UCS4 maxchar = 0;
8619 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008620
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008621 for(i = 0; i < len; ++i) {
8622 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8623 Py_UCS4 nu = 0;
8624
8625 if (Py_UNICODE_ISUPPER(ch))
8626 nu = Py_UNICODE_TOLOWER(ch);
8627 else if (Py_UNICODE_ISLOWER(ch))
8628 nu = Py_UNICODE_TOUPPER(ch);
8629
8630 if (nu != 0) {
8631 if (nu > maxchar)
8632 maxchar = nu;
8633 PyUnicode_WRITE(kind, data, i, nu);
8634 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008635 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008636 else if (ch > maxchar)
8637 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008638 }
8639
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008640 if (touched)
8641 return maxchar;
8642 else
8643 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008644}
8645
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008646static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008647fixcapitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008648{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008649 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8650 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8651 const int kind = PyUnicode_KIND(self);
8652 void *data = PyUnicode_DATA(self);
8653 int touched = 0;
8654 Py_UCS4 maxchar = 0;
8655 Py_ssize_t i = 0;
8656 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00008657
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008658 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008659 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008660
8661 ch = PyUnicode_READ(kind, data, i);
8662 if (!Py_UNICODE_ISUPPER(ch)) {
8663 maxchar = Py_UNICODE_TOUPPER(ch);
8664 PyUnicode_WRITE(kind, data, i, maxchar);
8665 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008666 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008667 ++i;
8668 for(; i < len; ++i) {
8669 ch = PyUnicode_READ(kind, data, i);
8670 if (!Py_UNICODE_ISLOWER(ch)) {
8671 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8672 if (lo > maxchar)
8673 maxchar = lo;
8674 PyUnicode_WRITE(kind, data, i, lo);
8675 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008676 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008677 else if (ch > maxchar)
8678 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008679 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008680
8681 if (touched)
8682 return maxchar;
8683 else
8684 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008685}
8686
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008687static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008688fixtitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008689{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008690 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8691 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8692 const int kind = PyUnicode_KIND(self);
8693 void *data = PyUnicode_DATA(self);
8694 Py_UCS4 maxchar = 0;
8695 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008696 int previous_is_cased;
8697
8698 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008699 if (len == 1) {
8700 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8701 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
8702 if (ti != ch) {
8703 PyUnicode_WRITE(kind, data, i, ti);
8704 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00008705 }
8706 else
8707 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008708 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008709 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008710 for(; i < len; ++i) {
8711 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8712 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00008713
Benjamin Peterson29060642009-01-31 22:14:21 +00008714 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008715 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00008716 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008717 nu = Py_UNICODE_TOTITLE(ch);
8718
8719 if (nu > maxchar)
8720 maxchar = nu;
8721 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00008722
Benjamin Peterson29060642009-01-31 22:14:21 +00008723 if (Py_UNICODE_ISLOWER(ch) ||
8724 Py_UNICODE_ISUPPER(ch) ||
8725 Py_UNICODE_ISTITLE(ch))
8726 previous_is_cased = 1;
8727 else
8728 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008729 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008730 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008731}
8732
Tim Peters8ce9f162004-08-27 01:49:32 +00008733PyObject *
8734PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008735{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008736 PyObject *sep = NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008737 Py_ssize_t seplen = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008738 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00008739 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008740 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
8741 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00008742 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008743 Py_ssize_t sz, i, res_offset;
8744 Py_UCS4 maxchar = 0;
8745 Py_UCS4 item_maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008746
Tim Peters05eba1f2004-08-27 21:32:02 +00008747 fseq = PySequence_Fast(seq, "");
8748 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008749 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00008750 }
8751
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008752 /* NOTE: the following code can't call back into Python code,
8753 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00008754 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008755
Tim Peters05eba1f2004-08-27 21:32:02 +00008756 seqlen = PySequence_Fast_GET_SIZE(fseq);
8757 /* If empty sequence, return u"". */
8758 if (seqlen == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008759 res = PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008760 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00008761 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008762 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00008763 /* If singleton sequence with an exact Unicode, return that. */
8764 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008765 item = items[0];
8766 if (PyUnicode_CheckExact(item)) {
8767 Py_INCREF(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008768 res = item;
Benjamin Peterson29060642009-01-31 22:14:21 +00008769 goto Done;
8770 }
Tim Peters8ce9f162004-08-27 01:49:32 +00008771 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008772 else {
8773 /* Set up sep and seplen */
8774 if (separator == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008775 /* fall back to a blank space separator */
8776 sep = PyUnicode_FromOrdinal(' ');
Victor Stinnere9a29352011-10-01 02:14:59 +02008777 if (!sep)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008778 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00008779 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008780 else {
8781 if (!PyUnicode_Check(separator)) {
8782 PyErr_Format(PyExc_TypeError,
8783 "separator: expected str instance,"
8784 " %.80s found",
8785 Py_TYPE(separator)->tp_name);
8786 goto onError;
8787 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008788 if (PyUnicode_READY(separator) == -1)
8789 goto onError;
8790 sep = separator;
8791 seplen = PyUnicode_GET_LENGTH(separator);
8792 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
8793 /* inc refcount to keep this code path symetric with the
8794 above case of a blank separator */
8795 Py_INCREF(sep);
Tim Peters05eba1f2004-08-27 21:32:02 +00008796 }
8797 }
8798
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008799 /* There are at least two things to join, or else we have a subclass
8800 * of str in the sequence.
8801 * Do a pre-pass to figure out the total amount of space we'll
8802 * need (sz), and see whether all argument are strings.
8803 */
8804 sz = 0;
8805 for (i = 0; i < seqlen; i++) {
8806 const Py_ssize_t old_sz = sz;
8807 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00008808 if (!PyUnicode_Check(item)) {
8809 PyErr_Format(PyExc_TypeError,
8810 "sequence item %zd: expected str instance,"
8811 " %.80s found",
8812 i, Py_TYPE(item)->tp_name);
8813 goto onError;
8814 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008815 if (PyUnicode_READY(item) == -1)
8816 goto onError;
8817 sz += PyUnicode_GET_LENGTH(item);
8818 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
8819 if (item_maxchar > maxchar)
8820 maxchar = item_maxchar;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008821 if (i != 0)
8822 sz += seplen;
8823 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
8824 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008825 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008826 goto onError;
8827 }
8828 }
Tim Petersced69f82003-09-16 20:30:58 +00008829
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008830 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008831 if (res == NULL)
8832 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00008833
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008834 /* Catenate everything. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008835 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008836 Py_ssize_t itemlen;
8837 item = items[i];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008838 itemlen = PyUnicode_GET_LENGTH(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008839 /* Copy item, and maybe the separator. */
8840 if (i) {
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008841 if (PyUnicode_CopyCharacters(res, res_offset,
8842 sep, 0, seplen) < 0)
8843 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008844 res_offset += seplen;
Benjamin Peterson29060642009-01-31 22:14:21 +00008845 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008846 if (PyUnicode_CopyCharacters(res, res_offset,
8847 item, 0, itemlen) < 0)
8848 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008849 res_offset += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00008850 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008851 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00008852
Benjamin Peterson29060642009-01-31 22:14:21 +00008853 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00008854 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008855 Py_XDECREF(sep);
8856 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008857
Benjamin Peterson29060642009-01-31 22:14:21 +00008858 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00008859 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008860 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00008861 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008862 return NULL;
8863}
8864
/* Store `value` into `length` consecutive characters of the buffer
   `data`, beginning at character index `start`.  `kind` selects the
   character width: 1-byte buffers are filled with memset(), 2-byte
   buffers element by element, and every other kind is treated as
   4-byte.  PyUnicode_WCHAR_KIND is not supported.

   Every argument is parenthesized in the expansion so that compound
   expressions can be passed safely; arguments may still be evaluated
   more than once. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        } \
    } while (0)
8887
Alexander Belopolsky40018472011-02-26 01:02:56 +00008888static PyUnicodeObject *
8889pad(PyUnicodeObject *self,
8890 Py_ssize_t left,
8891 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008892 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008893{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008894 PyObject *u;
8895 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008896 int kind;
8897 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008898
8899 if (left < 0)
8900 left = 0;
8901 if (right < 0)
8902 right = 0;
8903
Tim Peters7a29bd52001-09-12 03:03:31 +00008904 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008905 Py_INCREF(self);
8906 return self;
8907 }
8908
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008909 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
8910 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00008911 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
8912 return NULL;
8913 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008914 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
8915 if (fill > maxchar)
8916 maxchar = fill;
8917 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008918 if (!u)
8919 return NULL;
8920
8921 kind = PyUnicode_KIND(u);
8922 data = PyUnicode_DATA(u);
8923 if (left)
8924 FILL(kind, data, fill, 0, left);
8925 if (right)
8926 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinner157f83f2011-09-28 21:41:31 +02008927 if (PyUnicode_CopyCharacters(u, left,
8928 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008929 _PyUnicode_LENGTH(self)) < 0)
8930 {
8931 Py_DECREF(u);
8932 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008933 }
8934
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008935 return (PyUnicodeObject*)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008936}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008937#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00008938
Alexander Belopolsky40018472011-02-26 01:02:56 +00008939PyObject *
8940PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008941{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008942 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008943
8944 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008945 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008946 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008947
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008948 switch(PyUnicode_KIND(string)) {
8949 case PyUnicode_1BYTE_KIND:
8950 list = ucs1lib_splitlines(
8951 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
8952 PyUnicode_GET_LENGTH(string), keepends);
8953 break;
8954 case PyUnicode_2BYTE_KIND:
8955 list = ucs2lib_splitlines(
8956 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
8957 PyUnicode_GET_LENGTH(string), keepends);
8958 break;
8959 case PyUnicode_4BYTE_KIND:
8960 list = ucs4lib_splitlines(
8961 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
8962 PyUnicode_GET_LENGTH(string), keepends);
8963 break;
8964 default:
8965 assert(0);
8966 list = 0;
8967 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008968 Py_DECREF(string);
8969 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008970}
8971
Alexander Belopolsky40018472011-02-26 01:02:56 +00008972static PyObject *
8973split(PyUnicodeObject *self,
8974 PyUnicodeObject *substring,
8975 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008976{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008977 int kind1, kind2, kind;
8978 void *buf1, *buf2;
8979 Py_ssize_t len1, len2;
8980 PyObject* out;
8981
Guido van Rossumd57fd912000-03-10 22:53:23 +00008982 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008983 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008984
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008985 if (PyUnicode_READY(self) == -1)
8986 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008987
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008988 if (substring == NULL)
8989 switch(PyUnicode_KIND(self)) {
8990 case PyUnicode_1BYTE_KIND:
8991 return ucs1lib_split_whitespace(
8992 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
8993 PyUnicode_GET_LENGTH(self), maxcount
8994 );
8995 case PyUnicode_2BYTE_KIND:
8996 return ucs2lib_split_whitespace(
8997 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
8998 PyUnicode_GET_LENGTH(self), maxcount
8999 );
9000 case PyUnicode_4BYTE_KIND:
9001 return ucs4lib_split_whitespace(
9002 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9003 PyUnicode_GET_LENGTH(self), maxcount
9004 );
9005 default:
9006 assert(0);
9007 return NULL;
9008 }
9009
9010 if (PyUnicode_READY(substring) == -1)
9011 return NULL;
9012
9013 kind1 = PyUnicode_KIND(self);
9014 kind2 = PyUnicode_KIND(substring);
9015 kind = kind1 > kind2 ? kind1 : kind2;
9016 buf1 = PyUnicode_DATA(self);
9017 buf2 = PyUnicode_DATA(substring);
9018 if (kind1 != kind)
9019 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9020 if (!buf1)
9021 return NULL;
9022 if (kind2 != kind)
9023 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9024 if (!buf2) {
9025 if (kind1 != kind) PyMem_Free(buf1);
9026 return NULL;
9027 }
9028 len1 = PyUnicode_GET_LENGTH(self);
9029 len2 = PyUnicode_GET_LENGTH(substring);
9030
9031 switch(kind) {
9032 case PyUnicode_1BYTE_KIND:
9033 out = ucs1lib_split(
9034 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9035 break;
9036 case PyUnicode_2BYTE_KIND:
9037 out = ucs2lib_split(
9038 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9039 break;
9040 case PyUnicode_4BYTE_KIND:
9041 out = ucs4lib_split(
9042 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9043 break;
9044 default:
9045 out = NULL;
9046 }
9047 if (kind1 != kind)
9048 PyMem_Free(buf1);
9049 if (kind2 != kind)
9050 PyMem_Free(buf2);
9051 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009052}
9053
Alexander Belopolsky40018472011-02-26 01:02:56 +00009054static PyObject *
9055rsplit(PyUnicodeObject *self,
9056 PyUnicodeObject *substring,
9057 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009058{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009059 int kind1, kind2, kind;
9060 void *buf1, *buf2;
9061 Py_ssize_t len1, len2;
9062 PyObject* out;
9063
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009064 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009065 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009066
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009067 if (PyUnicode_READY(self) == -1)
9068 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009069
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009070 if (substring == NULL)
9071 switch(PyUnicode_KIND(self)) {
9072 case PyUnicode_1BYTE_KIND:
9073 return ucs1lib_rsplit_whitespace(
9074 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9075 PyUnicode_GET_LENGTH(self), maxcount
9076 );
9077 case PyUnicode_2BYTE_KIND:
9078 return ucs2lib_rsplit_whitespace(
9079 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9080 PyUnicode_GET_LENGTH(self), maxcount
9081 );
9082 case PyUnicode_4BYTE_KIND:
9083 return ucs4lib_rsplit_whitespace(
9084 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9085 PyUnicode_GET_LENGTH(self), maxcount
9086 );
9087 default:
9088 assert(0);
9089 return NULL;
9090 }
9091
9092 if (PyUnicode_READY(substring) == -1)
9093 return NULL;
9094
9095 kind1 = PyUnicode_KIND(self);
9096 kind2 = PyUnicode_KIND(substring);
9097 kind = kind1 > kind2 ? kind1 : kind2;
9098 buf1 = PyUnicode_DATA(self);
9099 buf2 = PyUnicode_DATA(substring);
9100 if (kind1 != kind)
9101 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9102 if (!buf1)
9103 return NULL;
9104 if (kind2 != kind)
9105 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9106 if (!buf2) {
9107 if (kind1 != kind) PyMem_Free(buf1);
9108 return NULL;
9109 }
9110 len1 = PyUnicode_GET_LENGTH(self);
9111 len2 = PyUnicode_GET_LENGTH(substring);
9112
9113 switch(kind) {
9114 case PyUnicode_1BYTE_KIND:
9115 out = ucs1lib_rsplit(
9116 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9117 break;
9118 case PyUnicode_2BYTE_KIND:
9119 out = ucs2lib_rsplit(
9120 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9121 break;
9122 case PyUnicode_4BYTE_KIND:
9123 out = ucs4lib_rsplit(
9124 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9125 break;
9126 default:
9127 out = NULL;
9128 }
9129 if (kind1 != kind)
9130 PyMem_Free(buf1);
9131 if (kind2 != kind)
9132 PyMem_Free(buf2);
9133 return out;
9134}
9135
9136static Py_ssize_t
9137anylib_find(int kind, void *buf1, Py_ssize_t len1,
9138 void *buf2, Py_ssize_t len2, Py_ssize_t offset)
9139{
9140 switch(kind) {
9141 case PyUnicode_1BYTE_KIND:
9142 return ucs1lib_find(buf1, len1, buf2, len2, offset);
9143 case PyUnicode_2BYTE_KIND:
9144 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9145 case PyUnicode_4BYTE_KIND:
9146 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9147 }
9148 assert(0);
9149 return -1;
9150}
9151
9152static Py_ssize_t
9153anylib_count(int kind, void* sbuf, Py_ssize_t slen,
9154 void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
9155{
9156 switch(kind) {
9157 case PyUnicode_1BYTE_KIND:
9158 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
9159 case PyUnicode_2BYTE_KIND:
9160 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9161 case PyUnicode_4BYTE_KIND:
9162 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9163 }
9164 assert(0);
9165 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009166}
9167
Alexander Belopolsky40018472011-02-26 01:02:56 +00009168static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009169replace(PyObject *self, PyObject *str1,
9170 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009171{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009172 PyObject *u;
9173 char *sbuf = PyUnicode_DATA(self);
9174 char *buf1 = PyUnicode_DATA(str1);
9175 char *buf2 = PyUnicode_DATA(str2);
9176 int srelease = 0, release1 = 0, release2 = 0;
9177 int skind = PyUnicode_KIND(self);
9178 int kind1 = PyUnicode_KIND(str1);
9179 int kind2 = PyUnicode_KIND(str2);
9180 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9181 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9182 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009183
9184 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009185 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009186 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009187 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009188
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009189 if (skind < kind1)
9190 /* substring too wide to be present */
9191 goto nothing;
9192
9193 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00009194 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009195 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009196 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009197 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009198 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009199 /* replace characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009200 Py_UCS4 u1, u2, maxchar;
9201 int mayshrink, rkind;
9202 u1 = PyUnicode_READ_CHAR(str1, 0);
9203 if (!findchar(sbuf, PyUnicode_KIND(self),
9204 slen, u1, 1))
Thomas Wouters477c8d52006-05-27 19:21:47 +00009205 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009206 u2 = PyUnicode_READ_CHAR(str2, 0);
9207 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9208 /* Replacing u1 with u2 may cause a maxchar reduction in the
9209 result string. */
9210 mayshrink = maxchar > 127;
9211 if (u2 > maxchar) {
9212 maxchar = u2;
9213 mayshrink = 0;
9214 }
9215 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009216 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009217 goto error;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009218 if (PyUnicode_CopyCharacters(u, 0,
9219 (PyObject*)self, 0, slen) < 0)
9220 {
9221 Py_DECREF(u);
9222 return NULL;
9223 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009224 rkind = PyUnicode_KIND(u);
9225 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
9226 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009227 if (--maxcount < 0)
9228 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009229 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009230 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009231 if (mayshrink) {
9232 PyObject *tmp = u;
9233 u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp),
9234 PyUnicode_GET_LENGTH(tmp));
9235 Py_DECREF(tmp);
9236 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009237 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009238 int rkind = skind;
9239 char *res;
9240 if (kind1 < rkind) {
9241 /* widen substring */
9242 buf1 = _PyUnicode_AsKind(str1, rkind);
9243 if (!buf1) goto error;
9244 release1 = 1;
9245 }
9246 i = anylib_find(rkind, sbuf, slen, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009247 if (i < 0)
9248 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009249 if (rkind > kind2) {
9250 /* widen replacement */
9251 buf2 = _PyUnicode_AsKind(str2, rkind);
9252 if (!buf2) goto error;
9253 release2 = 1;
9254 }
9255 else if (rkind < kind2) {
9256 /* widen self and buf1 */
9257 rkind = kind2;
9258 if (release1) PyMem_Free(buf1);
9259 sbuf = _PyUnicode_AsKind(self, rkind);
9260 if (!sbuf) goto error;
9261 srelease = 1;
9262 buf1 = _PyUnicode_AsKind(str1, rkind);
9263 if (!buf1) goto error;
9264 release1 = 1;
9265 }
9266 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen));
9267 if (!res) {
9268 PyErr_NoMemory();
9269 goto error;
9270 }
9271 memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen));
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009272 /* change everything in-place, starting with this one */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009273 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9274 buf2,
9275 PyUnicode_KIND_SIZE(rkind, len2));
9276 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009277
9278 while ( --maxcount > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009279 i = anylib_find(rkind, sbuf+PyUnicode_KIND_SIZE(rkind, i),
9280 slen-i,
9281 buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009282 if (i == -1)
9283 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009284 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9285 buf2,
9286 PyUnicode_KIND_SIZE(rkind, len2));
9287 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009288 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009289
9290 u = PyUnicode_FromKindAndData(rkind, res, slen);
9291 PyMem_Free(res);
9292 if (!u) goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009293 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009294 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009295
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009296 Py_ssize_t n, i, j, ires;
9297 Py_ssize_t product, new_size;
9298 int rkind = skind;
9299 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009300
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009301 if (kind1 < rkind) {
9302 buf1 = _PyUnicode_AsKind(str1, rkind);
9303 if (!buf1) goto error;
9304 release1 = 1;
9305 }
9306 n = anylib_count(rkind, sbuf, slen, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009307 if (n == 0)
9308 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009309 if (kind2 < rkind) {
9310 buf2 = _PyUnicode_AsKind(str2, rkind);
9311 if (!buf2) goto error;
9312 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009313 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009314 else if (kind2 > rkind) {
9315 rkind = kind2;
9316 sbuf = _PyUnicode_AsKind(self, rkind);
9317 if (!sbuf) goto error;
9318 srelease = 1;
9319 if (release1) PyMem_Free(buf1);
9320 buf1 = _PyUnicode_AsKind(str1, rkind);
9321 if (!buf1) goto error;
9322 release1 = 1;
9323 }
9324 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
9325 PyUnicode_GET_LENGTH(str1))); */
9326 product = n * (len2-len1);
9327 if ((product / (len2-len1)) != n) {
9328 PyErr_SetString(PyExc_OverflowError,
9329 "replace string is too long");
9330 goto error;
9331 }
9332 new_size = slen + product;
9333 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
9334 PyErr_SetString(PyExc_OverflowError,
9335 "replace string is too long");
9336 goto error;
9337 }
9338 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size));
9339 if (!res)
9340 goto error;
9341 ires = i = 0;
9342 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009343 while (n-- > 0) {
9344 /* look for next match */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009345 j = anylib_find(rkind,
9346 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9347 slen-i, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009348 if (j == -1)
9349 break;
9350 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009351 /* copy unchanged part [i:j] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009352 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9353 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9354 PyUnicode_KIND_SIZE(rkind, j-i));
9355 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009356 }
9357 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009358 if (len2 > 0) {
9359 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9360 buf2,
9361 PyUnicode_KIND_SIZE(rkind, len2));
9362 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009363 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009364 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009365 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009366 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009367 /* copy tail [i:] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009368 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9369 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9370 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009371 } else {
9372 /* interleave */
9373 while (n > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009374 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9375 buf2,
9376 PyUnicode_KIND_SIZE(rkind, len2));
9377 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009378 if (--n <= 0)
9379 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009380 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9381 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9382 PyUnicode_KIND_SIZE(rkind, 1));
9383 ires++;
9384 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009385 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009386 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9387 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9388 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009389 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009390 u = PyUnicode_FromKindAndData(rkind, res, new_size);
Martin v. Löwis0b1d3482011-10-01 16:35:40 +02009391 PyMem_Free(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009392 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009393 if (srelease)
9394 PyMem_FREE(sbuf);
9395 if (release1)
9396 PyMem_FREE(buf1);
9397 if (release2)
9398 PyMem_FREE(buf2);
9399 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009400
Benjamin Peterson29060642009-01-31 22:14:21 +00009401 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00009402 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009403 if (srelease)
9404 PyMem_FREE(sbuf);
9405 if (release1)
9406 PyMem_FREE(buf1);
9407 if (release2)
9408 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009409 if (PyUnicode_CheckExact(self)) {
9410 Py_INCREF(self);
9411 return (PyObject *) self;
9412 }
Victor Stinner034f6cf2011-09-30 02:26:44 +02009413 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009414 error:
9415 if (srelease && sbuf)
9416 PyMem_FREE(sbuf);
9417 if (release1 && buf1)
9418 PyMem_FREE(buf1);
9419 if (release2 && buf2)
9420 PyMem_FREE(buf2);
9421 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009422}
9423
9424/* --- Unicode Object Methods --------------------------------------------- */
9425
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009426PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009427 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009428\n\
9429Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009430characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009431
9432static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009433unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009434{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009435 return fixup(self, fixtitle);
9436}
9437
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009438PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009439 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009440\n\
9441Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00009442have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009443
9444static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009445unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009446{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009447 return fixup(self, fixcapitalize);
9448}
9449
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled code (compiled out by the surrounding #if 0).
   Splits self with split(self, NULL, -1), replaces every list item by
   its fixup(..., fixcapitalize) result, and joins the list again with
   PyUnicode_Join(NULL, list).  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t pos;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping it into the list in place */
    for (pos = 0; pos < PyList_GET_SIZE(words); pos++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, pos),
                            fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(PyList_GET_ITEM(words, pos));
        PyList_SET_ITEM(words, pos, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
9487
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009488/* Argument converter. Coerces to a single unicode character */
9489
9490static int
9491convert_uc(PyObject *obj, void *addr)
9492{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009493 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009494 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009495
Benjamin Peterson14339b62009-01-31 16:36:08 +00009496 uniobj = PyUnicode_FromObject(obj);
9497 if (uniobj == NULL) {
9498 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009499 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009500 return 0;
9501 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009502 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009503 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009504 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009505 Py_DECREF(uniobj);
9506 return 0;
9507 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009508 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009509 Py_DECREF(uniobj);
9510 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009511}
9512
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009513PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009514 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009515\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009516Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009517done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009518
9519static PyObject *
9520unicode_center(PyUnicodeObject *self, PyObject *args)
9521{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009522 Py_ssize_t marg, left;
9523 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009524 Py_UCS4 fillchar = ' ';
9525
Victor Stinnere9a29352011-10-01 02:14:59 +02009526 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009527 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009528
Victor Stinnere9a29352011-10-01 02:14:59 +02009529 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009530 return NULL;
9531
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009532 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009533 Py_INCREF(self);
9534 return (PyObject*) self;
9535 }
9536
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009537 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009538 left = marg / 2 + (marg & width & 1);
9539
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009540 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009541}
9542
Marc-André Lemburge5034372000-08-08 08:04:29 +00009543#if 0
9544
9545/* This code should go into some future Unicode collation support
9546 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00009547 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00009548
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009549/* speedy UTF-16 code point order comparison */
9550/* gleaned from: */
9551/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
9552
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009553static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009554{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009555 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00009556 0, 0, 0, 0, 0, 0, 0, 0,
9557 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009558 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009559};
9560
Guido van Rossumd57fd912000-03-10 22:53:23 +00009561static int
9562unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9563{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009564 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009565
Guido van Rossumd57fd912000-03-10 22:53:23 +00009566 Py_UNICODE *s1 = str1->str;
9567 Py_UNICODE *s2 = str2->str;
9568
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009569 len1 = str1->_base._base.length;
9570 len2 = str2->_base._base.length;
Tim Petersced69f82003-09-16 20:30:58 +00009571
Guido van Rossumd57fd912000-03-10 22:53:23 +00009572 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00009573 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009574
9575 c1 = *s1++;
9576 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00009577
Benjamin Peterson29060642009-01-31 22:14:21 +00009578 if (c1 > (1<<11) * 26)
9579 c1 += utf16Fixup[c1>>11];
9580 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009581 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009582 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00009583
9584 if (c1 != c2)
9585 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00009586
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009587 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009588 }
9589
9590 return (len1 < len2) ? -1 : (len1 != len2);
9591}
9592
Marc-André Lemburge5034372000-08-08 08:04:29 +00009593#else
9594
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009595/* This function assumes that str1 and str2 are readied by the caller. */
9596
Marc-André Lemburge5034372000-08-08 08:04:29 +00009597static int
9598unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9599{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009600 int kind1, kind2;
9601 void *data1, *data2;
9602 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009603
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009604 kind1 = PyUnicode_KIND(str1);
9605 kind2 = PyUnicode_KIND(str2);
9606 data1 = PyUnicode_DATA(str1);
9607 data2 = PyUnicode_DATA(str2);
9608 len1 = PyUnicode_GET_LENGTH(str1);
9609 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +00009610
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009611 for (i = 0; i < len1 && i < len2; ++i) {
9612 Py_UCS4 c1, c2;
9613 c1 = PyUnicode_READ(kind1, data1, i);
9614 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +00009615
9616 if (c1 != c2)
9617 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009618 }
9619
9620 return (len1 < len2) ? -1 : (len1 != len2);
9621}
9622
9623#endif
9624
Alexander Belopolsky40018472011-02-26 01:02:56 +00009625int
9626PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009627{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009628 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9629 if (PyUnicode_READY(left) == -1 ||
9630 PyUnicode_READY(right) == -1)
9631 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009632 return unicode_compare((PyUnicodeObject *)left,
9633 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009634 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009635 PyErr_Format(PyExc_TypeError,
9636 "Can't compare %.100s and %.100s",
9637 left->ob_type->tp_name,
9638 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009639 return -1;
9640}
9641
Martin v. Löwis5b222132007-06-10 09:51:05 +00009642int
9643PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
9644{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009645 Py_ssize_t i;
9646 int kind;
9647 void *data;
9648 Py_UCS4 chr;
9649
Victor Stinner910337b2011-10-03 03:20:16 +02009650 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009651 if (PyUnicode_READY(uni) == -1)
9652 return -1;
9653 kind = PyUnicode_KIND(uni);
9654 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009655 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009656 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
9657 if (chr != str[i])
9658 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00009659 /* This check keeps Python strings that end in '\0' from comparing equal
9660 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009661 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +00009662 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009663 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00009664 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009665 return 0;
9666}
9667
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009668
Benjamin Peterson29060642009-01-31 22:14:21 +00009669#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00009670 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009671
Alexander Belopolsky40018472011-02-26 01:02:56 +00009672PyObject *
9673PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009674{
9675 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009676
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009677 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9678 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009679 if (PyUnicode_READY(left) == -1 ||
9680 PyUnicode_READY(right) == -1)
9681 return NULL;
9682 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
9683 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009684 if (op == Py_EQ) {
9685 Py_INCREF(Py_False);
9686 return Py_False;
9687 }
9688 if (op == Py_NE) {
9689 Py_INCREF(Py_True);
9690 return Py_True;
9691 }
9692 }
9693 if (left == right)
9694 result = 0;
9695 else
9696 result = unicode_compare((PyUnicodeObject *)left,
9697 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009698
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009699 /* Convert the return value to a Boolean */
9700 switch (op) {
9701 case Py_EQ:
9702 v = TEST_COND(result == 0);
9703 break;
9704 case Py_NE:
9705 v = TEST_COND(result != 0);
9706 break;
9707 case Py_LE:
9708 v = TEST_COND(result <= 0);
9709 break;
9710 case Py_GE:
9711 v = TEST_COND(result >= 0);
9712 break;
9713 case Py_LT:
9714 v = TEST_COND(result == -1);
9715 break;
9716 case Py_GT:
9717 v = TEST_COND(result == 1);
9718 break;
9719 default:
9720 PyErr_BadArgument();
9721 return NULL;
9722 }
9723 Py_INCREF(v);
9724 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009725 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009726
Brian Curtindfc80e32011-08-10 20:28:54 -05009727 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009728}
9729
Alexander Belopolsky40018472011-02-26 01:02:56 +00009730int
9731PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00009732{
Thomas Wouters477c8d52006-05-27 19:21:47 +00009733 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009734 int kind1, kind2, kind;
9735 void *buf1, *buf2;
9736 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009737 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009738
9739 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00009740 sub = PyUnicode_FromObject(element);
9741 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009742 PyErr_Format(PyExc_TypeError,
9743 "'in <string>' requires string as left operand, not %s",
9744 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009745 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009746 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009747 if (PyUnicode_READY(sub) == -1)
9748 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009749
Thomas Wouters477c8d52006-05-27 19:21:47 +00009750 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +02009751 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009752 Py_DECREF(sub);
9753 return -1;
9754 }
9755
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009756 kind1 = PyUnicode_KIND(str);
9757 kind2 = PyUnicode_KIND(sub);
9758 kind = kind1 > kind2 ? kind1 : kind2;
9759 buf1 = PyUnicode_DATA(str);
9760 buf2 = PyUnicode_DATA(sub);
9761 if (kind1 != kind)
9762 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
9763 if (!buf1) {
9764 Py_DECREF(sub);
9765 return -1;
9766 }
9767 if (kind2 != kind)
9768 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
9769 if (!buf2) {
9770 Py_DECREF(sub);
9771 if (kind1 != kind) PyMem_Free(buf1);
9772 return -1;
9773 }
9774 len1 = PyUnicode_GET_LENGTH(str);
9775 len2 = PyUnicode_GET_LENGTH(sub);
9776
9777 switch(kind) {
9778 case PyUnicode_1BYTE_KIND:
9779 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
9780 break;
9781 case PyUnicode_2BYTE_KIND:
9782 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
9783 break;
9784 case PyUnicode_4BYTE_KIND:
9785 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
9786 break;
9787 default:
9788 result = -1;
9789 assert(0);
9790 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009791
9792 Py_DECREF(str);
9793 Py_DECREF(sub);
9794
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009795 if (kind1 != kind)
9796 PyMem_Free(buf1);
9797 if (kind2 != kind)
9798 PyMem_Free(buf2);
9799
Guido van Rossum403d68b2000-03-13 15:55:09 +00009800 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009801}
9802
Guido van Rossumd57fd912000-03-10 22:53:23 +00009803/* Concat to string or Unicode object giving a new Unicode object. */
9804
Alexander Belopolsky40018472011-02-26 01:02:56 +00009805PyObject *
9806PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009807{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009808 PyObject *u = NULL, *v = NULL, *w;
9809 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009810
9811 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009812 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009813 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009814 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009815 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009816 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009817 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009818
9819 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +02009820 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009821 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009822 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009823 }
Victor Stinnera464fc12011-10-02 20:39:30 +02009824 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009825 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009826 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009827 }
9828
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009829 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinnerff9e50f2011-09-28 22:17:19 +02009830 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009831
Guido van Rossumd57fd912000-03-10 22:53:23 +00009832 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009833 w = PyUnicode_New(
9834 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
9835 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009836 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009837 goto onError;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009838 if (PyUnicode_CopyCharacters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)) < 0)
9839 goto onError;
Victor Stinner157f83f2011-09-28 21:41:31 +02009840 if (PyUnicode_CopyCharacters(w, PyUnicode_GET_LENGTH(u),
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009841 v, 0,
9842 PyUnicode_GET_LENGTH(v)) < 0)
9843 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009844 Py_DECREF(u);
9845 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009846 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009847
Benjamin Peterson29060642009-01-31 22:14:21 +00009848 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009849 Py_XDECREF(u);
9850 Py_XDECREF(v);
9851 return NULL;
9852}
9853
Walter Dörwald1ab83302007-05-18 17:15:44 +00009854void
Victor Stinner23e56682011-10-03 03:54:37 +02009855PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +00009856{
Victor Stinner23e56682011-10-03 03:54:37 +02009857 PyObject *left, *res;
9858
9859 if (p_left == NULL) {
9860 if (!PyErr_Occurred())
9861 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +00009862 return;
9863 }
Victor Stinner23e56682011-10-03 03:54:37 +02009864 left = *p_left;
9865 if (right == NULL || !PyUnicode_Check(left)) {
9866 if (!PyErr_Occurred())
9867 PyErr_BadInternalCall();
9868 goto error;
9869 }
9870
9871 if (PyUnicode_CheckExact(left) && left != unicode_empty
9872 && PyUnicode_CheckExact(right) && right != unicode_empty
9873 && unicode_resizable(left)
9874 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
9875 || _PyUnicode_WSTR(left) != NULL))
9876 {
9877 Py_ssize_t u_len, v_len, new_len, copied;
9878
9879 /* FIXME: don't make wstr string ready */
9880 if (PyUnicode_READY(left))
9881 goto error;
9882 if (PyUnicode_READY(right))
9883 goto error;
9884
9885 /* FIXME: support ascii+latin1, PyASCIIObject => PyCompactUnicodeObject */
9886 if (PyUnicode_MAX_CHAR_VALUE(right) <= PyUnicode_MAX_CHAR_VALUE(left))
9887 {
9888 u_len = PyUnicode_GET_LENGTH(left);
9889 v_len = PyUnicode_GET_LENGTH(right);
9890 if (u_len > PY_SSIZE_T_MAX - v_len) {
9891 PyErr_SetString(PyExc_OverflowError,
9892 "strings are too large to concat");
9893 goto error;
9894 }
9895 new_len = u_len + v_len;
9896
9897 /* Now we own the last reference to 'left', so we can resize it
9898 * in-place.
9899 */
9900 if (unicode_resize(&left, new_len) != 0) {
9901 /* XXX if _PyUnicode_Resize() fails, 'left' has been
9902 * deallocated so it cannot be put back into
9903 * 'variable'. The MemoryError is raised when there
9904 * is no value in 'variable', which might (very
9905 * remotely) be a cause of incompatibilities.
9906 */
9907 goto error;
9908 }
9909 /* copy 'right' into the newly allocated area of 'left' */
9910 copied = PyUnicode_CopyCharacters(left, u_len,
9911 right, 0,
9912 v_len);
9913 assert(0 <= copied);
9914 *p_left = left;
9915 return;
9916 }
9917 }
9918
9919 res = PyUnicode_Concat(left, right);
9920 if (res == NULL)
9921 goto error;
9922 Py_DECREF(left);
9923 *p_left = res;
9924 return;
9925
9926error:
9927 Py_DECREF(*p_left);
9928 *p_left = NULL;
Walter Dörwald1ab83302007-05-18 17:15:44 +00009929}
9930
9931void
9932PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
9933{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009934 PyUnicode_Append(pleft, right);
9935 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00009936}
9937
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009938PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009939 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009940\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00009941Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009942string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009943interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009944
9945static PyObject *
9946unicode_count(PyUnicodeObject *self, PyObject *args)
9947{
9948 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009949 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009950 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009951 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009952 int kind1, kind2, kind;
9953 void *buf1, *buf2;
9954 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009955
Jesus Ceaac451502011-04-20 17:09:23 +02009956 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
9957 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +00009958 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00009959
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009960 kind1 = PyUnicode_KIND(self);
9961 kind2 = PyUnicode_KIND(substring);
9962 kind = kind1 > kind2 ? kind1 : kind2;
9963 buf1 = PyUnicode_DATA(self);
9964 buf2 = PyUnicode_DATA(substring);
9965 if (kind1 != kind)
9966 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9967 if (!buf1) {
9968 Py_DECREF(substring);
9969 return NULL;
9970 }
9971 if (kind2 != kind)
9972 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9973 if (!buf2) {
9974 Py_DECREF(substring);
9975 if (kind1 != kind) PyMem_Free(buf1);
9976 return NULL;
9977 }
9978 len1 = PyUnicode_GET_LENGTH(self);
9979 len2 = PyUnicode_GET_LENGTH(substring);
9980
9981 ADJUST_INDICES(start, end, len1);
9982 switch(kind) {
9983 case PyUnicode_1BYTE_KIND:
9984 iresult = ucs1lib_count(
9985 ((Py_UCS1*)buf1) + start, end - start,
9986 buf2, len2, PY_SSIZE_T_MAX
9987 );
9988 break;
9989 case PyUnicode_2BYTE_KIND:
9990 iresult = ucs2lib_count(
9991 ((Py_UCS2*)buf1) + start, end - start,
9992 buf2, len2, PY_SSIZE_T_MAX
9993 );
9994 break;
9995 case PyUnicode_4BYTE_KIND:
9996 iresult = ucs4lib_count(
9997 ((Py_UCS4*)buf1) + start, end - start,
9998 buf2, len2, PY_SSIZE_T_MAX
9999 );
10000 break;
10001 default:
10002 assert(0); iresult = 0;
10003 }
10004
10005 result = PyLong_FromSsize_t(iresult);
10006
10007 if (kind1 != kind)
10008 PyMem_Free(buf1);
10009 if (kind2 != kind)
10010 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010011
10012 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010013
Guido van Rossumd57fd912000-03-10 22:53:23 +000010014 return result;
10015}
10016
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010017PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010018 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010019\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010020Encode S using the codec registered for encoding. Default encoding\n\
10021is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010022handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010023a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10024'xmlcharrefreplace' as well as any other name registered with\n\
10025codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010026
10027static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +000010028unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010029{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010030 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010031 char *encoding = NULL;
10032 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010033
Benjamin Peterson308d6372009-09-18 21:42:35 +000010034 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10035 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010036 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +000010037 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010038}
10039
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010040PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010041 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010042\n\
10043Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010044If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010045
10046static PyObject*
10047unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
10048{
10049 Py_UNICODE *e;
10050 Py_UNICODE *p;
10051 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010052 Py_UNICODE *qe;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010053 Py_ssize_t i, j, incr, wstr_length;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010054 PyUnicodeObject *u;
10055 int tabsize = 8;
10056
10057 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010058 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010059
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010060 if (PyUnicode_AsUnicodeAndSize((PyObject *)self, &wstr_length) == NULL)
10061 return NULL;
10062
Thomas Wouters7e474022000-07-16 12:04:32 +000010063 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010064 i = 0; /* chars up to and including most recent \n or \r */
10065 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010066 e = _PyUnicode_WSTR(self) + wstr_length; /* end of input */
10067 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010068 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010069 if (tabsize > 0) {
10070 incr = tabsize - (j % tabsize); /* cannot overflow */
10071 if (j > PY_SSIZE_T_MAX - incr)
10072 goto overflow1;
10073 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010074 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010075 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010076 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010077 if (j > PY_SSIZE_T_MAX - 1)
10078 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010079 j++;
10080 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010081 if (i > PY_SSIZE_T_MAX - j)
10082 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010083 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010084 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010085 }
10086 }
10087
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010088 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +000010089 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010090
Guido van Rossumd57fd912000-03-10 22:53:23 +000010091 /* Second pass: create output string and fill it */
10092 u = _PyUnicode_New(i + j);
10093 if (!u)
10094 return NULL;
10095
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010096 j = 0; /* same as in first pass */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010097 q = _PyUnicode_WSTR(u); /* next output char */
10098 qe = _PyUnicode_WSTR(u) + PyUnicode_GET_SIZE(u); /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +000010099
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010100 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010101 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010102 if (tabsize > 0) {
10103 i = tabsize - (j % tabsize);
10104 j += i;
10105 while (i--) {
10106 if (q >= qe)
10107 goto overflow2;
10108 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010109 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010110 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010111 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010112 else {
10113 if (q >= qe)
10114 goto overflow2;
10115 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010116 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010117 if (*p == '\n' || *p == '\r')
10118 j = 0;
10119 }
10120
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010121 if (PyUnicode_READY(u) == -1) {
10122 Py_DECREF(u);
10123 return NULL;
10124 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010125 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010126
10127 overflow2:
10128 Py_DECREF(u);
10129 overflow1:
10130 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10131 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010132}
10133
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010134PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010135 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010136\n\
10137Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010138such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010139arguments start and end are interpreted as in slice notation.\n\
10140\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010141Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010142
10143static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010144unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010145{
Jesus Ceaac451502011-04-20 17:09:23 +020010146 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010147 Py_ssize_t start;
10148 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010149 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010150
Jesus Ceaac451502011-04-20 17:09:23 +020010151 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10152 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010153 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010154
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010155 if (PyUnicode_READY(self) == -1)
10156 return NULL;
10157 if (PyUnicode_READY(substring) == -1)
10158 return NULL;
10159
10160 result = any_find_slice(
10161 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
10162 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010163 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010164
10165 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010166
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010167 if (result == -2)
10168 return NULL;
10169
Christian Heimes217cfd12007-12-02 14:31:20 +000010170 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010171}
10172
10173static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010174unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010175{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010176 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
10177 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010178 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010179 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010180}
10181
Guido van Rossumc2504932007-09-18 19:42:40 +000010182/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010010183 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000010184static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +000010185unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010186{
Guido van Rossumc2504932007-09-18 19:42:40 +000010187 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010010188 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010189
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010190 if (_PyUnicode_HASH(self) != -1)
10191 return _PyUnicode_HASH(self);
10192 if (PyUnicode_READY(self) == -1)
10193 return -1;
10194 len = PyUnicode_GET_LENGTH(self);
10195
10196 /* The hash function as a macro, gets expanded three times below. */
10197#define HASH(P) \
10198 x = (Py_uhash_t)*P << 7; \
10199 while (--len >= 0) \
10200 x = (1000003*x) ^ (Py_uhash_t)*P++;
10201
10202 switch (PyUnicode_KIND(self)) {
10203 case PyUnicode_1BYTE_KIND: {
10204 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
10205 HASH(c);
10206 break;
10207 }
10208 case PyUnicode_2BYTE_KIND: {
10209 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
10210 HASH(s);
10211 break;
10212 }
10213 default: {
10214 Py_UCS4 *l;
10215 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
10216 "Impossible switch case in unicode_hash");
10217 l = PyUnicode_4BYTE_DATA(self);
10218 HASH(l);
10219 break;
10220 }
10221 }
10222 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
10223
Guido van Rossumc2504932007-09-18 19:42:40 +000010224 if (x == -1)
10225 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010226 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010227 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010228}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010229#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000010230
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010231PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010232 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010233\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010234Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010235
10236static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010237unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010238{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010239 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +020010240 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010241 Py_ssize_t start;
10242 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010243
Jesus Ceaac451502011-04-20 17:09:23 +020010244 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
10245 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010246 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010247
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010248 if (PyUnicode_READY(self) == -1)
10249 return NULL;
10250 if (PyUnicode_READY(substring) == -1)
10251 return NULL;
10252
10253 result = any_find_slice(
10254 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
10255 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010256 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010257
10258 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010259
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010260 if (result == -2)
10261 return NULL;
10262
Guido van Rossumd57fd912000-03-10 22:53:23 +000010263 if (result < 0) {
10264 PyErr_SetString(PyExc_ValueError, "substring not found");
10265 return NULL;
10266 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010267
Christian Heimes217cfd12007-12-02 14:31:20 +000010268 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010269}
10270
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010271PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010272 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010273\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010274Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010275at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010276
10277static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010278unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010279{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010280 Py_ssize_t i, length;
10281 int kind;
10282 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010283 int cased;
10284
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010285 if (PyUnicode_READY(self) == -1)
10286 return NULL;
10287 length = PyUnicode_GET_LENGTH(self);
10288 kind = PyUnicode_KIND(self);
10289 data = PyUnicode_DATA(self);
10290
Guido van Rossumd57fd912000-03-10 22:53:23 +000010291 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010292 if (length == 1)
10293 return PyBool_FromLong(
10294 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010295
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010296 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010297 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010298 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010299
Guido van Rossumd57fd912000-03-10 22:53:23 +000010300 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010301 for (i = 0; i < length; i++) {
10302 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010303
Benjamin Peterson29060642009-01-31 22:14:21 +000010304 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
10305 return PyBool_FromLong(0);
10306 else if (!cased && Py_UNICODE_ISLOWER(ch))
10307 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010308 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010309 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010310}
10311
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010312PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010313 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010314\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010315Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010316at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010317
10318static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010319unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010320{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010321 Py_ssize_t i, length;
10322 int kind;
10323 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010324 int cased;
10325
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010326 if (PyUnicode_READY(self) == -1)
10327 return NULL;
10328 length = PyUnicode_GET_LENGTH(self);
10329 kind = PyUnicode_KIND(self);
10330 data = PyUnicode_DATA(self);
10331
Guido van Rossumd57fd912000-03-10 22:53:23 +000010332 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010333 if (length == 1)
10334 return PyBool_FromLong(
10335 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010336
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010337 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010338 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010339 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010340
Guido van Rossumd57fd912000-03-10 22:53:23 +000010341 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010342 for (i = 0; i < length; i++) {
10343 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010344
Benjamin Peterson29060642009-01-31 22:14:21 +000010345 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
10346 return PyBool_FromLong(0);
10347 else if (!cased && Py_UNICODE_ISUPPER(ch))
10348 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010349 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010350 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010351}
10352
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010353PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010354 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010355\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010356Return True if S is a titlecased string and there is at least one\n\
10357character in S, i.e. upper- and titlecase characters may only\n\
10358follow uncased characters and lowercase characters only cased ones.\n\
10359Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010360
10361static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010362unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010363{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010364 Py_ssize_t i, length;
10365 int kind;
10366 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010367 int cased, previous_is_cased;
10368
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010369 if (PyUnicode_READY(self) == -1)
10370 return NULL;
10371 length = PyUnicode_GET_LENGTH(self);
10372 kind = PyUnicode_KIND(self);
10373 data = PyUnicode_DATA(self);
10374
Guido van Rossumd57fd912000-03-10 22:53:23 +000010375 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010376 if (length == 1) {
10377 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10378 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
10379 (Py_UNICODE_ISUPPER(ch) != 0));
10380 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010381
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010382 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010383 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010384 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010385
Guido van Rossumd57fd912000-03-10 22:53:23 +000010386 cased = 0;
10387 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010388 for (i = 0; i < length; i++) {
10389 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010390
Benjamin Peterson29060642009-01-31 22:14:21 +000010391 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
10392 if (previous_is_cased)
10393 return PyBool_FromLong(0);
10394 previous_is_cased = 1;
10395 cased = 1;
10396 }
10397 else if (Py_UNICODE_ISLOWER(ch)) {
10398 if (!previous_is_cased)
10399 return PyBool_FromLong(0);
10400 previous_is_cased = 1;
10401 cased = 1;
10402 }
10403 else
10404 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010405 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010406 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010407}
10408
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010409PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010410 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010411\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010412Return True if all characters in S are whitespace\n\
10413and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010414
10415static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010416unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010417{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010418 Py_ssize_t i, length;
10419 int kind;
10420 void *data;
10421
10422 if (PyUnicode_READY(self) == -1)
10423 return NULL;
10424 length = PyUnicode_GET_LENGTH(self);
10425 kind = PyUnicode_KIND(self);
10426 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010427
Guido van Rossumd57fd912000-03-10 22:53:23 +000010428 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010429 if (length == 1)
10430 return PyBool_FromLong(
10431 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010432
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010433 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010434 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010435 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010436
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010437 for (i = 0; i < length; i++) {
10438 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010439 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010440 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010441 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010442 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010443}
10444
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010445PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010446 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010447\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010448Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010449and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010450
10451static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010452unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010453{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010454 Py_ssize_t i, length;
10455 int kind;
10456 void *data;
10457
10458 if (PyUnicode_READY(self) == -1)
10459 return NULL;
10460 length = PyUnicode_GET_LENGTH(self);
10461 kind = PyUnicode_KIND(self);
10462 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010463
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010464 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010465 if (length == 1)
10466 return PyBool_FromLong(
10467 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010468
10469 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010470 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010471 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010472
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010473 for (i = 0; i < length; i++) {
10474 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010475 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010476 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010477 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010478}
10479
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010480PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010481 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010482\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010483Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010484and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010485
10486static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010487unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010488{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010489 int kind;
10490 void *data;
10491 Py_ssize_t len, i;
10492
10493 if (PyUnicode_READY(self) == -1)
10494 return NULL;
10495
10496 kind = PyUnicode_KIND(self);
10497 data = PyUnicode_DATA(self);
10498 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010499
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010500 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010501 if (len == 1) {
10502 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10503 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10504 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010505
10506 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010507 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010508 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010509
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010510 for (i = 0; i < len; i++) {
10511 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010512 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010513 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010514 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010515 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010516}
10517
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010518PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010519 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010520\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010521Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010522False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010523
10524static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010525unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010526{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010527 Py_ssize_t i, length;
10528 int kind;
10529 void *data;
10530
10531 if (PyUnicode_READY(self) == -1)
10532 return NULL;
10533 length = PyUnicode_GET_LENGTH(self);
10534 kind = PyUnicode_KIND(self);
10535 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010536
Guido van Rossumd57fd912000-03-10 22:53:23 +000010537 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010538 if (length == 1)
10539 return PyBool_FromLong(
10540 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010541
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010542 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010543 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010544 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010545
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010546 for (i = 0; i < length; i++) {
10547 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010548 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010549 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010550 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010551}
10552
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010553PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010554 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010555\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010556Return True if all characters in S are digits\n\
10557and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010558
10559static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010560unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010561{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010562 Py_ssize_t i, length;
10563 int kind;
10564 void *data;
10565
10566 if (PyUnicode_READY(self) == -1)
10567 return NULL;
10568 length = PyUnicode_GET_LENGTH(self);
10569 kind = PyUnicode_KIND(self);
10570 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010571
Guido van Rossumd57fd912000-03-10 22:53:23 +000010572 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010573 if (length == 1) {
10574 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10575 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
10576 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010577
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010578 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010579 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010580 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010581
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010582 for (i = 0; i < length; i++) {
10583 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010584 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010585 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010586 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010587}
10588
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010589PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010590 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010591\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010592Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010593False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010594
10595static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010596unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010597{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010598 Py_ssize_t i, length;
10599 int kind;
10600 void *data;
10601
10602 if (PyUnicode_READY(self) == -1)
10603 return NULL;
10604 length = PyUnicode_GET_LENGTH(self);
10605 kind = PyUnicode_KIND(self);
10606 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010607
Guido van Rossumd57fd912000-03-10 22:53:23 +000010608 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010609 if (length == 1)
10610 return PyBool_FromLong(
10611 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010612
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010613 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010614 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010615 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010616
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010617 for (i = 0; i < length; i++) {
10618 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010619 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010620 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010621 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010622}
10623
Martin v. Löwis47383402007-08-15 07:32:56 +000010624int
10625PyUnicode_IsIdentifier(PyObject *self)
10626{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010627 int kind;
10628 void *data;
10629 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010630 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000010631
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010632 if (PyUnicode_READY(self) == -1) {
10633 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000010634 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010635 }
10636
10637 /* Special case for empty strings */
10638 if (PyUnicode_GET_LENGTH(self) == 0)
10639 return 0;
10640 kind = PyUnicode_KIND(self);
10641 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000010642
10643 /* PEP 3131 says that the first character must be in
10644 XID_Start and subsequent characters in XID_Continue,
10645 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000010646 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000010647 letters, digits, underscore). However, given the current
10648 definition of XID_Start and XID_Continue, it is sufficient
10649 to check just for these, except that _ must be allowed
10650 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010651 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050010652 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000010653 return 0;
10654
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040010655 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010656 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010657 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000010658 return 1;
10659}
10660
10661PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010662 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000010663\n\
10664Return True if S is a valid identifier according\n\
10665to the language definition.");
10666
10667static PyObject*
10668unicode_isidentifier(PyObject *self)
10669{
10670 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
10671}
10672
Georg Brandl559e5d72008-06-11 18:37:52 +000010673PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010674 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000010675\n\
10676Return True if all characters in S are considered\n\
10677printable in repr() or S is empty, False otherwise.");
10678
10679static PyObject*
10680unicode_isprintable(PyObject *self)
10681{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010682 Py_ssize_t i, length;
10683 int kind;
10684 void *data;
10685
10686 if (PyUnicode_READY(self) == -1)
10687 return NULL;
10688 length = PyUnicode_GET_LENGTH(self);
10689 kind = PyUnicode_KIND(self);
10690 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000010691
10692 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010693 if (length == 1)
10694 return PyBool_FromLong(
10695 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000010696
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010697 for (i = 0; i < length; i++) {
10698 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010699 Py_RETURN_FALSE;
10700 }
10701 }
10702 Py_RETURN_TRUE;
10703}
10704
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010705PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000010706 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010707\n\
10708Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000010709iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010710
10711static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010712unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010713{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010714 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010715}
10716
Martin v. Löwis18e16552006-02-15 17:27:45 +000010717static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000010718unicode_length(PyUnicodeObject *self)
10719{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010720 if (PyUnicode_READY(self) == -1)
10721 return -1;
10722 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010723}
10724
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010725PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010726 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010727\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010728Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010729done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010730
10731static PyObject *
10732unicode_ljust(PyUnicodeObject *self, PyObject *args)
10733{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010734 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010735 Py_UCS4 fillchar = ' ';
10736
10737 if (PyUnicode_READY(self) == -1)
10738 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010739
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010740 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010741 return NULL;
10742
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010743 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010744 Py_INCREF(self);
10745 return (PyObject*) self;
10746 }
10747
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010748 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010749}
10750
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010751PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010752 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010753\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010754Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010755
10756static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010757unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010758{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010759 return fixup(self, fixlower);
10760}
10761
/* Strip modes; the values double as indices into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for a strip mode: the format string minus its
   three-character "|O:" prefix. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
10770
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010771/* externally visible for str.strip(unicode) */
10772PyObject *
10773_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
10774{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010775 void *data;
10776 int kind;
10777 Py_ssize_t i, j, len;
10778 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010779
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010780 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
10781 return NULL;
10782
10783 kind = PyUnicode_KIND(self);
10784 data = PyUnicode_DATA(self);
10785 len = PyUnicode_GET_LENGTH(self);
10786 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
10787 PyUnicode_DATA(sepobj),
10788 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010789
Benjamin Peterson14339b62009-01-31 16:36:08 +000010790 i = 0;
10791 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010792 while (i < len &&
10793 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010794 i++;
10795 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010796 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010797
Benjamin Peterson14339b62009-01-31 16:36:08 +000010798 j = len;
10799 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010800 do {
10801 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010802 } while (j >= i &&
10803 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000010804 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010805 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010806
Victor Stinner12bab6d2011-10-01 01:53:49 +020010807 return PyUnicode_Substring((PyObject*)self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010808}
10809
10810PyObject*
10811PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
10812{
10813 unsigned char *data;
10814 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020010815 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010816
Victor Stinnerde636f32011-10-01 03:55:54 +020010817 if (PyUnicode_READY(self) == -1)
10818 return NULL;
10819
10820 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
10821
Victor Stinner12bab6d2011-10-01 01:53:49 +020010822 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010823 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020010824 if (PyUnicode_CheckExact(self)) {
10825 Py_INCREF(self);
10826 return self;
10827 }
10828 else
10829 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010830 }
10831
Victor Stinner12bab6d2011-10-01 01:53:49 +020010832 length = end - start;
10833 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010834 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010835
Victor Stinnerde636f32011-10-01 03:55:54 +020010836 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020010837 PyErr_SetString(PyExc_IndexError, "string index out of range");
10838 return NULL;
10839 }
10840
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010841 kind = PyUnicode_KIND(self);
10842 data = PyUnicode_1BYTE_DATA(self);
Victor Stinner034f6cf2011-09-30 02:26:44 +020010843 return PyUnicode_FromKindAndData(kind,
10844 data + PyUnicode_KIND_SIZE(kind, start),
Victor Stinner12bab6d2011-10-01 01:53:49 +020010845 length);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010846}
Guido van Rossumd57fd912000-03-10 22:53:23 +000010847
10848static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010849do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010850{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010851 int kind;
10852 void *data;
10853 Py_ssize_t len, i, j;
10854
10855 if (PyUnicode_READY(self) == -1)
10856 return NULL;
10857
10858 kind = PyUnicode_KIND(self);
10859 data = PyUnicode_DATA(self);
10860 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010861
Benjamin Peterson14339b62009-01-31 16:36:08 +000010862 i = 0;
10863 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010864 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010865 i++;
10866 }
10867 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010868
Benjamin Peterson14339b62009-01-31 16:36:08 +000010869 j = len;
10870 if (striptype != LEFTSTRIP) {
10871 do {
10872 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010873 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010874 j++;
10875 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010876
Victor Stinner12bab6d2011-10-01 01:53:49 +020010877 return PyUnicode_Substring((PyObject*)self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010878}
10879
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010880
10881static PyObject *
10882do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
10883{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010884 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010885
Benjamin Peterson14339b62009-01-31 16:36:08 +000010886 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
10887 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010888
Benjamin Peterson14339b62009-01-31 16:36:08 +000010889 if (sep != NULL && sep != Py_None) {
10890 if (PyUnicode_Check(sep))
10891 return _PyUnicode_XStrip(self, striptype, sep);
10892 else {
10893 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010894 "%s arg must be None or str",
10895 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010896 return NULL;
10897 }
10898 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010899
Benjamin Peterson14339b62009-01-31 16:36:08 +000010900 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010901}
10902
10903
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010904PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010905 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010906\n\
10907Return a copy of the string S with leading and trailing\n\
10908whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010909If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010910
10911static PyObject *
10912unicode_strip(PyUnicodeObject *self, PyObject *args)
10913{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010914 if (PyTuple_GET_SIZE(args) == 0)
10915 return do_strip(self, BOTHSTRIP); /* Common case */
10916 else
10917 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010918}
10919
10920
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010921PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010922 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010923\n\
10924Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010925If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010926
10927static PyObject *
10928unicode_lstrip(PyUnicodeObject *self, PyObject *args)
10929{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010930 if (PyTuple_GET_SIZE(args) == 0)
10931 return do_strip(self, LEFTSTRIP); /* Common case */
10932 else
10933 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010934}
10935
10936
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010937PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010938 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010939\n\
10940Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010941If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010942
10943static PyObject *
10944unicode_rstrip(PyUnicodeObject *self, PyObject *args)
10945{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010946 if (PyTuple_GET_SIZE(args) == 0)
10947 return do_strip(self, RIGHTSTRIP); /* Common case */
10948 else
10949 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010950}
10951
10952
Guido van Rossumd57fd912000-03-10 22:53:23 +000010953static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000010954unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010955{
10956 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010957 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010958
Georg Brandl222de0f2009-04-12 12:01:50 +000010959 if (len < 1) {
10960 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020010961 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000010962 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010963
Tim Peters7a29bd52001-09-12 03:03:31 +000010964 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010965 /* no repeat, return original string */
10966 Py_INCREF(str);
10967 return (PyObject*) str;
10968 }
Tim Peters8f422462000-09-09 06:13:41 +000010969
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010970 if (PyUnicode_READY(str) == -1)
10971 return NULL;
10972
Victor Stinnerc759f3e2011-10-01 03:09:58 +020010973 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020010974 PyErr_SetString(PyExc_OverflowError,
10975 "repeated string is too long");
10976 return NULL;
10977 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010978 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020010979
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010980 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010981 if (!u)
10982 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020010983 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010984
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010985 if (PyUnicode_GET_LENGTH(str) == 1) {
10986 const int kind = PyUnicode_KIND(str);
10987 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
10988 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020010989 if (kind == PyUnicode_1BYTE_KIND)
10990 memset(to, (unsigned char)fill_char, len);
10991 else {
10992 for (n = 0; n < len; ++n)
10993 PyUnicode_WRITE(kind, to, n, fill_char);
10994 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010995 }
10996 else {
10997 /* number of characters copied this far */
10998 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
10999 const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str);
11000 char *to = (char *) PyUnicode_DATA(u);
11001 Py_MEMCPY(to, PyUnicode_DATA(str),
11002 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011003 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011004 n = (done <= nchars-done) ? done : nchars-done;
11005 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011006 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011007 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011008 }
11009
11010 return (PyObject*) u;
11011}
11012
Alexander Belopolsky40018472011-02-26 01:02:56 +000011013PyObject *
11014PyUnicode_Replace(PyObject *obj,
11015 PyObject *subobj,
11016 PyObject *replobj,
11017 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011018{
11019 PyObject *self;
11020 PyObject *str1;
11021 PyObject *str2;
11022 PyObject *result;
11023
11024 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011025 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011026 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011027 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011028 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011029 Py_DECREF(self);
11030 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011031 }
11032 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011033 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011034 Py_DECREF(self);
11035 Py_DECREF(str1);
11036 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011037 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011038 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011039 Py_DECREF(self);
11040 Py_DECREF(str1);
11041 Py_DECREF(str2);
11042 return result;
11043}
11044
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011045PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011046 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011047\n\
11048Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011049old replaced by new. If the optional argument count is\n\
11050given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011051
11052static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011053unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011054{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011055 PyObject *str1;
11056 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011057 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011058 PyObject *result;
11059
Martin v. Löwis18e16552006-02-15 17:27:45 +000011060 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011061 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011062 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011063 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011064 str1 = PyUnicode_FromObject(str1);
11065 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11066 return NULL;
11067 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011068 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011069 Py_DECREF(str1);
11070 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011071 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011072
11073 result = replace(self, str1, str2, maxcount);
11074
11075 Py_DECREF(str1);
11076 Py_DECREF(str2);
11077 return result;
11078}
11079
Alexander Belopolsky40018472011-02-26 01:02:56 +000011080static PyObject *
11081unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011082{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011083 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011084 Py_ssize_t isize;
11085 Py_ssize_t osize, squote, dquote, i, o;
11086 Py_UCS4 max, quote;
11087 int ikind, okind;
11088 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011089
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011090 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011091 return NULL;
11092
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011093 isize = PyUnicode_GET_LENGTH(unicode);
11094 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011095
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011096 /* Compute length of output, quote characters, and
11097 maximum character */
11098 osize = 2; /* quotes */
11099 max = 127;
11100 squote = dquote = 0;
11101 ikind = PyUnicode_KIND(unicode);
11102 for (i = 0; i < isize; i++) {
11103 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11104 switch (ch) {
11105 case '\'': squote++; osize++; break;
11106 case '"': dquote++; osize++; break;
11107 case '\\': case '\t': case '\r': case '\n':
11108 osize += 2; break;
11109 default:
11110 /* Fast-path ASCII */
11111 if (ch < ' ' || ch == 0x7f)
11112 osize += 4; /* \xHH */
11113 else if (ch < 0x7f)
11114 osize++;
11115 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11116 osize++;
11117 max = ch > max ? ch : max;
11118 }
11119 else if (ch < 0x100)
11120 osize += 4; /* \xHH */
11121 else if (ch < 0x10000)
11122 osize += 6; /* \uHHHH */
11123 else
11124 osize += 10; /* \uHHHHHHHH */
11125 }
11126 }
11127
11128 quote = '\'';
11129 if (squote) {
11130 if (dquote)
11131 /* Both squote and dquote present. Use squote,
11132 and escape them */
11133 osize += squote;
11134 else
11135 quote = '"';
11136 }
11137
11138 repr = PyUnicode_New(osize, max);
11139 if (repr == NULL)
11140 return NULL;
11141 okind = PyUnicode_KIND(repr);
11142 odata = PyUnicode_DATA(repr);
11143
11144 PyUnicode_WRITE(okind, odata, 0, quote);
11145 PyUnicode_WRITE(okind, odata, osize-1, quote);
11146
11147 for (i = 0, o = 1; i < isize; i++) {
11148 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011149
11150 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011151 if ((ch == quote) || (ch == '\\')) {
11152 PyUnicode_WRITE(okind, odata, o++, '\\');
11153 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011154 continue;
11155 }
11156
Benjamin Peterson29060642009-01-31 22:14:21 +000011157 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011158 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011159 PyUnicode_WRITE(okind, odata, o++, '\\');
11160 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011161 }
11162 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011163 PyUnicode_WRITE(okind, odata, o++, '\\');
11164 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011165 }
11166 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011167 PyUnicode_WRITE(okind, odata, o++, '\\');
11168 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011169 }
11170
11171 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011172 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011173 PyUnicode_WRITE(okind, odata, o++, '\\');
11174 PyUnicode_WRITE(okind, odata, o++, 'x');
11175 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11176 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011177 }
11178
Georg Brandl559e5d72008-06-11 18:37:52 +000011179 /* Copy ASCII characters as-is */
11180 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011181 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011182 }
11183
Benjamin Peterson29060642009-01-31 22:14:21 +000011184 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000011185 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011186 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000011187 (categories Z* and C* except ASCII space)
11188 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011189 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011190 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011191 if (ch <= 0xff) {
11192 PyUnicode_WRITE(okind, odata, o++, '\\');
11193 PyUnicode_WRITE(okind, odata, o++, 'x');
11194 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11195 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011196 }
11197 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011198 else if (ch >= 0x10000) {
11199 PyUnicode_WRITE(okind, odata, o++, '\\');
11200 PyUnicode_WRITE(okind, odata, o++, 'U');
11201 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
11202 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
11203 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
11204 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
11205 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11206 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11207 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11208 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011209 }
11210 /* Map 16-bit characters to '\uxxxx' */
11211 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011212 PyUnicode_WRITE(okind, odata, o++, '\\');
11213 PyUnicode_WRITE(okind, odata, o++, 'u');
11214 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11215 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11216 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11217 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011218 }
11219 }
11220 /* Copy characters as-is */
11221 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011222 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011223 }
11224 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000011225 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011226 /* Closing quote already added at the beginning */
Walter Dörwald79e913e2007-05-12 11:08:06 +000011227 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011228}
11229
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011230PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011231 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011232\n\
11233Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011234such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011235arguments start and end are interpreted as in slice notation.\n\
11236\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011237Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011238
11239static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011240unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011241{
Jesus Ceaac451502011-04-20 17:09:23 +020011242 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011243 Py_ssize_t start;
11244 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011245 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011246
Jesus Ceaac451502011-04-20 17:09:23 +020011247 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
11248 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011249 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011250
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011251 if (PyUnicode_READY(self) == -1)
11252 return NULL;
11253 if (PyUnicode_READY(substring) == -1)
11254 return NULL;
11255
11256 result = any_find_slice(
11257 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11258 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011259 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011260
11261 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011262
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011263 if (result == -2)
11264 return NULL;
11265
Christian Heimes217cfd12007-12-02 14:31:20 +000011266 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011267}
11268
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011269PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011270 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011271\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011272Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011273
11274static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011275unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011276{
Jesus Ceaac451502011-04-20 17:09:23 +020011277 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011278 Py_ssize_t start;
11279 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011280 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011281
Jesus Ceaac451502011-04-20 17:09:23 +020011282 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
11283 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011284 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011285
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011286 if (PyUnicode_READY(self) == -1)
11287 return NULL;
11288 if (PyUnicode_READY(substring) == -1)
11289 return NULL;
11290
11291 result = any_find_slice(
11292 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11293 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011294 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011295
11296 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011297
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011298 if (result == -2)
11299 return NULL;
11300
Guido van Rossumd57fd912000-03-10 22:53:23 +000011301 if (result < 0) {
11302 PyErr_SetString(PyExc_ValueError, "substring not found");
11303 return NULL;
11304 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011305
Christian Heimes217cfd12007-12-02 14:31:20 +000011306 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011307}
11308
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011309PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011310 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011311\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011312Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011313done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011314
11315static PyObject *
11316unicode_rjust(PyUnicodeObject *self, PyObject *args)
11317{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011318 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011319 Py_UCS4 fillchar = ' ';
11320
Victor Stinnere9a29352011-10-01 02:14:59 +020011321 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011322 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011323
Victor Stinnere9a29352011-10-01 02:14:59 +020011324 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011325 return NULL;
11326
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011327 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011328 Py_INCREF(self);
11329 return (PyObject*) self;
11330 }
11331
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011332 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011333}
11334
Alexander Belopolsky40018472011-02-26 01:02:56 +000011335PyObject *
11336PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011337{
11338 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000011339
Guido van Rossumd57fd912000-03-10 22:53:23 +000011340 s = PyUnicode_FromObject(s);
11341 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011342 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011343 if (sep != NULL) {
11344 sep = PyUnicode_FromObject(sep);
11345 if (sep == NULL) {
11346 Py_DECREF(s);
11347 return NULL;
11348 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011349 }
11350
11351 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11352
11353 Py_DECREF(s);
11354 Py_XDECREF(sep);
11355 return result;
11356}
11357
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011358PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011359 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011360\n\
11361Return a list of the words in S, using sep as the\n\
11362delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000011363splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000011364whitespace string is a separator and empty strings are\n\
11365removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011366
11367static PyObject*
11368unicode_split(PyUnicodeObject *self, PyObject *args)
11369{
11370 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011371 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011372
Martin v. Löwis18e16552006-02-15 17:27:45 +000011373 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011374 return NULL;
11375
11376 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011377 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011378 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011379 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011380 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011381 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011382}
11383
Thomas Wouters477c8d52006-05-27 19:21:47 +000011384PyObject *
11385PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11386{
11387 PyObject* str_obj;
11388 PyObject* sep_obj;
11389 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011390 int kind1, kind2, kind;
11391 void *buf1 = NULL, *buf2 = NULL;
11392 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011393
11394 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020011395 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011396 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011397 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011398 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011399 Py_DECREF(str_obj);
11400 return NULL;
11401 }
11402
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011403 kind1 = PyUnicode_KIND(str_in);
11404 kind2 = PyUnicode_KIND(sep_obj);
11405 kind = kind1 > kind2 ? kind1 : kind2;
11406 buf1 = PyUnicode_DATA(str_in);
11407 if (kind1 != kind)
11408 buf1 = _PyUnicode_AsKind(str_in, kind);
11409 if (!buf1)
11410 goto onError;
11411 buf2 = PyUnicode_DATA(sep_obj);
11412 if (kind2 != kind)
11413 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11414 if (!buf2)
11415 goto onError;
11416 len1 = PyUnicode_GET_LENGTH(str_obj);
11417 len2 = PyUnicode_GET_LENGTH(sep_obj);
11418
11419 switch(PyUnicode_KIND(str_in)) {
11420 case PyUnicode_1BYTE_KIND:
11421 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11422 break;
11423 case PyUnicode_2BYTE_KIND:
11424 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11425 break;
11426 case PyUnicode_4BYTE_KIND:
11427 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11428 break;
11429 default:
11430 assert(0);
11431 out = 0;
11432 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011433
11434 Py_DECREF(sep_obj);
11435 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011436 if (kind1 != kind)
11437 PyMem_Free(buf1);
11438 if (kind2 != kind)
11439 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011440
11441 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011442 onError:
11443 Py_DECREF(sep_obj);
11444 Py_DECREF(str_obj);
11445 if (kind1 != kind && buf1)
11446 PyMem_Free(buf1);
11447 if (kind2 != kind && buf2)
11448 PyMem_Free(buf2);
11449 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011450}
11451
11452
11453PyObject *
11454PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11455{
11456 PyObject* str_obj;
11457 PyObject* sep_obj;
11458 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011459 int kind1, kind2, kind;
11460 void *buf1 = NULL, *buf2 = NULL;
11461 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011462
11463 str_obj = PyUnicode_FromObject(str_in);
11464 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000011465 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011466 sep_obj = PyUnicode_FromObject(sep_in);
11467 if (!sep_obj) {
11468 Py_DECREF(str_obj);
11469 return NULL;
11470 }
11471
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011472 kind1 = PyUnicode_KIND(str_in);
11473 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020011474 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011475 buf1 = PyUnicode_DATA(str_in);
11476 if (kind1 != kind)
11477 buf1 = _PyUnicode_AsKind(str_in, kind);
11478 if (!buf1)
11479 goto onError;
11480 buf2 = PyUnicode_DATA(sep_obj);
11481 if (kind2 != kind)
11482 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11483 if (!buf2)
11484 goto onError;
11485 len1 = PyUnicode_GET_LENGTH(str_obj);
11486 len2 = PyUnicode_GET_LENGTH(sep_obj);
11487
11488 switch(PyUnicode_KIND(str_in)) {
11489 case PyUnicode_1BYTE_KIND:
11490 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11491 break;
11492 case PyUnicode_2BYTE_KIND:
11493 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11494 break;
11495 case PyUnicode_4BYTE_KIND:
11496 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11497 break;
11498 default:
11499 assert(0);
11500 out = 0;
11501 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011502
11503 Py_DECREF(sep_obj);
11504 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011505 if (kind1 != kind)
11506 PyMem_Free(buf1);
11507 if (kind2 != kind)
11508 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011509
11510 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011511 onError:
11512 Py_DECREF(sep_obj);
11513 Py_DECREF(str_obj);
11514 if (kind1 != kind && buf1)
11515 PyMem_Free(buf1);
11516 if (kind2 != kind && buf2)
11517 PyMem_Free(buf2);
11518 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011519}
11520
11521PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011522 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011523\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011524Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011525the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011526found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011527
11528static PyObject*
11529unicode_partition(PyUnicodeObject *self, PyObject *separator)
11530{
11531 return PyUnicode_Partition((PyObject *)self, separator);
11532}
11533
11534PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000011535 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011536\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011537Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011538the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011539separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011540
11541static PyObject*
11542unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
11543{
11544 return PyUnicode_RPartition((PyObject *)self, separator);
11545}
11546
Alexander Belopolsky40018472011-02-26 01:02:56 +000011547PyObject *
11548PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011549{
11550 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011551
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011552 s = PyUnicode_FromObject(s);
11553 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011554 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011555 if (sep != NULL) {
11556 sep = PyUnicode_FromObject(sep);
11557 if (sep == NULL) {
11558 Py_DECREF(s);
11559 return NULL;
11560 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011561 }
11562
11563 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11564
11565 Py_DECREF(s);
11566 Py_XDECREF(sep);
11567 return result;
11568}
11569
11570PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011571 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011572\n\
11573Return a list of the words in S, using sep as the\n\
11574delimiter string, starting at the end of the string and\n\
11575working to the front. If maxsplit is given, at most maxsplit\n\
11576splits are done. If sep is not specified, any whitespace string\n\
11577is a separator.");
11578
11579static PyObject*
11580unicode_rsplit(PyUnicodeObject *self, PyObject *args)
11581{
11582 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011583 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011584
Martin v. Löwis18e16552006-02-15 17:27:45 +000011585 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011586 return NULL;
11587
11588 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011589 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011590 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011591 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011592 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011593 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011594}
11595
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011596PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011597 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011598\n\
11599Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000011600Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011601is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011602
11603static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011604unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011605{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011606 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000011607 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011608
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011609 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
11610 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011611 return NULL;
11612
Guido van Rossum86662912000-04-11 15:38:46 +000011613 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011614}
11615
11616static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000011617PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011618{
Walter Dörwald346737f2007-05-31 10:44:43 +000011619 if (PyUnicode_CheckExact(self)) {
11620 Py_INCREF(self);
11621 return self;
11622 } else
11623 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020011624 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011625}
11626
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011627PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011628 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011629\n\
11630Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011631and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011632
11633static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011634unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011635{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011636 return fixup(self, fixswapcase);
11637}
11638
Georg Brandlceee0772007-11-27 23:48:05 +000011639PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011640 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011641\n\
11642Return a translation table usable for str.translate().\n\
11643If there is only one argument, it must be a dictionary mapping Unicode\n\
11644ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011645Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011646If there are two arguments, they must be strings of equal length, and\n\
11647in the resulting dictionary, each character in x will be mapped to the\n\
11648character at the same position in y. If there is a third argument, it\n\
11649must be a string, whose characters will be mapped to None in the result.");
11650
11651static PyObject*
11652unicode_maketrans(PyUnicodeObject *null, PyObject *args)
11653{
11654 PyObject *x, *y = NULL, *z = NULL;
11655 PyObject *new = NULL, *key, *value;
11656 Py_ssize_t i = 0;
11657 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011658
Georg Brandlceee0772007-11-27 23:48:05 +000011659 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
11660 return NULL;
11661 new = PyDict_New();
11662 if (!new)
11663 return NULL;
11664 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011665 int x_kind, y_kind, z_kind;
11666 void *x_data, *y_data, *z_data;
11667
Georg Brandlceee0772007-11-27 23:48:05 +000011668 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000011669 if (!PyUnicode_Check(x)) {
11670 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
11671 "be a string if there is a second argument");
11672 goto err;
11673 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011674 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011675 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
11676 "arguments must have equal length");
11677 goto err;
11678 }
11679 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011680 x_kind = PyUnicode_KIND(x);
11681 y_kind = PyUnicode_KIND(y);
11682 x_data = PyUnicode_DATA(x);
11683 y_data = PyUnicode_DATA(y);
11684 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
11685 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
11686 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011687 if (!key || !value)
11688 goto err;
11689 res = PyDict_SetItem(new, key, value);
11690 Py_DECREF(key);
11691 Py_DECREF(value);
11692 if (res < 0)
11693 goto err;
11694 }
11695 /* create entries for deleting chars in z */
11696 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011697 z_kind = PyUnicode_KIND(z);
11698 z_data = PyUnicode_DATA(z);
Georg Brandlceee0772007-11-27 23:48:05 +000011699 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011700 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011701 if (!key)
11702 goto err;
11703 res = PyDict_SetItem(new, key, Py_None);
11704 Py_DECREF(key);
11705 if (res < 0)
11706 goto err;
11707 }
11708 }
11709 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011710 int kind;
11711 void *data;
11712
Georg Brandlceee0772007-11-27 23:48:05 +000011713 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000011714 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011715 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
11716 "to maketrans it must be a dict");
11717 goto err;
11718 }
11719 /* copy entries into the new dict, converting string keys to int keys */
11720 while (PyDict_Next(x, &i, &key, &value)) {
11721 if (PyUnicode_Check(key)) {
11722 /* convert string keys to integer keys */
11723 PyObject *newkey;
11724 if (PyUnicode_GET_SIZE(key) != 1) {
11725 PyErr_SetString(PyExc_ValueError, "string keys in translate "
11726 "table must be of length 1");
11727 goto err;
11728 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011729 kind = PyUnicode_KIND(key);
11730 data = PyUnicode_DATA(key);
11731 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000011732 if (!newkey)
11733 goto err;
11734 res = PyDict_SetItem(new, newkey, value);
11735 Py_DECREF(newkey);
11736 if (res < 0)
11737 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000011738 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011739 /* just keep integer keys */
11740 if (PyDict_SetItem(new, key, value) < 0)
11741 goto err;
11742 } else {
11743 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
11744 "be strings or integers");
11745 goto err;
11746 }
11747 }
11748 }
11749 return new;
11750 err:
11751 Py_DECREF(new);
11752 return NULL;
11753}
11754
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011755PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011756 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011757\n\
11758Return a copy of the string S, where all characters have been mapped\n\
11759through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011760Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000011761Unmapped characters are left untouched. Characters mapped to None\n\
11762are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011763
11764static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011765unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011766{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011767 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011768}
11769
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011770PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011771 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011772\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011773Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011774
11775static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011776unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011777{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011778 return fixup(self, fixupper);
11779}
11780
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011781PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011782 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011783\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000011784Pad a numeric string S with zeros on the left, to fill a field\n\
11785of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011786
11787static PyObject *
11788unicode_zfill(PyUnicodeObject *self, PyObject *args)
11789{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011790 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011791 PyUnicodeObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011792 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011793 int kind;
11794 void *data;
11795 Py_UCS4 chr;
11796
11797 if (PyUnicode_READY(self) == -1)
11798 return NULL;
11799
Martin v. Löwis18e16552006-02-15 17:27:45 +000011800 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011801 return NULL;
11802
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011803 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000011804 if (PyUnicode_CheckExact(self)) {
11805 Py_INCREF(self);
11806 return (PyObject*) self;
11807 }
11808 else
Victor Stinner2219e0a2011-10-01 01:16:59 +020011809 return PyUnicode_Copy((PyObject*)self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011810 }
11811
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011812 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011813
11814 u = pad(self, fill, 0, '0');
11815
Walter Dörwald068325e2002-04-15 13:36:47 +000011816 if (u == NULL)
11817 return NULL;
11818
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011819 kind = PyUnicode_KIND(u);
11820 data = PyUnicode_DATA(u);
11821 chr = PyUnicode_READ(kind, data, fill);
11822
11823 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011824 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011825 PyUnicode_WRITE(kind, data, 0, chr);
11826 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000011827 }
11828
11829 return (PyObject*) u;
11830}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011831
#if 0
/* Compiled out.  Thin wrapper that forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII() and returns its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    return PyUnicode_TransformDecimalAndSpaceToASCII(self);
}
#endif
11839
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011840PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011841 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011842\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011843Return True if S starts with the specified prefix, False otherwise.\n\
11844With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011845With optional end, stop comparing S at that position.\n\
11846prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011847
11848static PyObject *
11849unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011850 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011851{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011852 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011853 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011854 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011855 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011856 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011857
Jesus Ceaac451502011-04-20 17:09:23 +020011858 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011859 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011860 if (PyTuple_Check(subobj)) {
11861 Py_ssize_t i;
11862 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11863 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011864 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011865 if (substring == NULL)
11866 return NULL;
11867 result = tailmatch(self, substring, start, end, -1);
11868 Py_DECREF(substring);
11869 if (result) {
11870 Py_RETURN_TRUE;
11871 }
11872 }
11873 /* nothing matched */
11874 Py_RETURN_FALSE;
11875 }
11876 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011877 if (substring == NULL) {
11878 if (PyErr_ExceptionMatches(PyExc_TypeError))
11879 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
11880 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011881 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011882 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011883 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011884 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011885 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011886}
11887
11888
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011889PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011890 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011891\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011892Return True if S ends with the specified suffix, False otherwise.\n\
11893With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011894With optional end, stop comparing S at that position.\n\
11895suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011896
11897static PyObject *
11898unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011899 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011900{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011901 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011902 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011903 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011904 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011905 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011906
Jesus Ceaac451502011-04-20 17:09:23 +020011907 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011908 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011909 if (PyTuple_Check(subobj)) {
11910 Py_ssize_t i;
11911 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11912 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011913 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011914 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011915 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011916 result = tailmatch(self, substring, start, end, +1);
11917 Py_DECREF(substring);
11918 if (result) {
11919 Py_RETURN_TRUE;
11920 }
11921 }
11922 Py_RETURN_FALSE;
11923 }
11924 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011925 if (substring == NULL) {
11926 if (PyErr_ExceptionMatches(PyExc_TypeError))
11927 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
11928 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011929 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011930 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011931 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011932 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011933 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011934}
11935
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011936#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000011937
11938PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011939 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000011940\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011941Return a formatted version of S, using substitutions from args and kwargs.\n\
11942The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000011943
Eric Smith27bbca62010-11-04 17:06:58 +000011944PyDoc_STRVAR(format_map__doc__,
11945 "S.format_map(mapping) -> str\n\
11946\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011947Return a formatted version of S, using substitutions from mapping.\n\
11948The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000011949
Eric Smith4a7d76d2008-05-30 18:10:19 +000011950static PyObject *
11951unicode__format__(PyObject* self, PyObject* args)
11952{
11953 PyObject *format_spec;
11954
11955 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
11956 return NULL;
11957
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011958 return _PyUnicode_FormatAdvanced(self, format_spec, 0,
11959 PyUnicode_GET_LENGTH(format_spec));
Eric Smith4a7d76d2008-05-30 18:10:19 +000011960}
11961
Eric Smith8c663262007-08-25 02:26:07 +000011962PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011963 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000011964\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011965Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000011966
11967static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011968unicode__sizeof__(PyUnicodeObject *v)
11969{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011970 Py_ssize_t size;
11971
11972 /* If it's a compact object, account for base structure +
11973 character data. */
11974 if (PyUnicode_IS_COMPACT_ASCII(v))
11975 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
11976 else if (PyUnicode_IS_COMPACT(v))
11977 size = sizeof(PyCompactUnicodeObject) +
11978 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v);
11979 else {
11980 /* If it is a two-block object, account for base object, and
11981 for character block if present. */
11982 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020011983 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011984 size += (PyUnicode_GET_LENGTH(v) + 1) *
11985 PyUnicode_CHARACTER_SIZE(v);
11986 }
11987 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020011988 with the data pointer. Check if the data is not shared. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011989 if (_PyUnicode_WSTR(v) &&
Victor Stinnera3be6132011-10-03 02:16:37 +020011990 (PyUnicode_DATA(v) != _PyUnicode_WSTR(v)))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011991 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020011992 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020011993 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011994
11995 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011996}
11997
/* Docstring for S.__sizeof__(). */
PyDoc_STRVAR(sizeof__doc__,
             "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012000
12001static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012002unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012003{
Victor Stinner034f6cf2011-09-30 02:26:44 +020012004 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012005 if (!copy)
12006 return NULL;
12007 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012008}
12009
/* Method table for the str type.  Each entry gives the Python-level name,
   the C implementation, its calling-convention flags and (where present)
   its docstring.  The table ends with a {NULL, NULL} sentinel entry. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
    /* maketrans is a static method: it is called without a string instance */
    {"maketrans", (PyCFunction) unicode_maketrans,
     METH_VARARGS | METH_STATIC, maketrans__doc__},
    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
#if 0
    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
#endif

#if 0
    /* These methods are just used for debugging the implementation. */
    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
#endif

    /* registered without a docstring */
    {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};
12073
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012074static PyObject *
12075unicode_mod(PyObject *v, PyObject *w)
12076{
Brian Curtindfc80e32011-08-10 20:28:54 -050012077 if (!PyUnicode_Check(v))
12078 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012079 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012080}
12081
/* Number protocol table for str.  Only nb_remainder is filled in
   (unicode_mod); the three slots before it are explicitly 0 and every
   slot after it is left to zero-initialization. */
static PyNumberMethods unicode_as_number = {
    0,              /*nb_add*/
    0,              /*nb_subtract*/
    0,              /*nb_multiply*/
    unicode_mod,            /*nb_remainder*/
};
12088
/* Sequence protocol table for str: length, concatenation, repetition,
   item access and containment are provided; the slice and assignment
   slots are 0. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,       /* sq_length */
    PyUnicode_Concat,           /* sq_concat */
    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    0,                  /* sq_slice */
    0,                  /* sq_ass_item */
    0,                  /* sq_ass_slice */
    PyUnicode_Contains,         /* sq_contains */
};
12099
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012100static PyObject*
12101unicode_subscript(PyUnicodeObject* self, PyObject* item)
12102{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012103 if (PyUnicode_READY(self) == -1)
12104 return NULL;
12105
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000012106 if (PyIndex_Check(item)) {
12107 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012108 if (i == -1 && PyErr_Occurred())
12109 return NULL;
12110 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012111 i += PyUnicode_GET_LENGTH(self);
Victor Stinner2fe5ced2011-10-02 00:25:40 +020012112 return unicode_getitem((PyObject*)self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012113 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000012114 Py_ssize_t start, stop, step, slicelength, cur, i;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012115 const Py_UNICODE* source_buf;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012116 Py_UNICODE* result_buf;
12117 PyObject* result;
12118
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012119 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000012120 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012121 return NULL;
12122 }
12123
12124 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012125 return PyUnicode_New(0, 0);
12126 } else if (start == 0 && step == 1 &&
12127 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000012128 PyUnicode_CheckExact(self)) {
12129 Py_INCREF(self);
12130 return (PyObject *)self;
12131 } else if (step == 1) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012132 return PyUnicode_Substring((PyObject*)self,
12133 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012134 } else {
12135 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +000012136 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
12137 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012138
Benjamin Peterson29060642009-01-31 22:14:21 +000012139 if (result_buf == NULL)
12140 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012141
12142 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
12143 result_buf[i] = source_buf[cur];
12144 }
Tim Petersced69f82003-09-16 20:30:58 +000012145
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012146 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +000012147 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012148 return result;
12149 }
12150 } else {
12151 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
12152 return NULL;
12153 }
12154}
12155
/* Mapping protocol table for str: length and subscripting are provided;
   the item-assignment slot is 0. */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,        /* mp_length */
    (binaryfunc)unicode_subscript,  /* mp_subscript */
    (objobjargproc)0,           /* mp_ass_subscript */
};
12161
Guido van Rossumd57fd912000-03-10 22:53:23 +000012162
Guido van Rossumd57fd912000-03-10 22:53:23 +000012163/* Helpers for PyUnicode_Format() */
12164
12165static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000012166getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012167{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012168 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012169 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012170 (*p_argidx)++;
12171 if (arglen < 0)
12172 return args;
12173 else
12174 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012175 }
12176 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012177 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012178 return NULL;
12179}
12180
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012181/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012182
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012183static PyObject *
12184formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012185{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012186 char *p;
12187 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012188 double x;
Tim Petersced69f82003-09-16 20:30:58 +000012189
Guido van Rossumd57fd912000-03-10 22:53:23 +000012190 x = PyFloat_AsDouble(v);
12191 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012192 return NULL;
12193
Guido van Rossumd57fd912000-03-10 22:53:23 +000012194 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012195 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000012196
Eric Smith0923d1d2009-04-16 20:16:10 +000012197 p = PyOS_double_to_string(x, type, prec,
12198 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012199 if (p == NULL)
12200 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012201 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000012202 PyMem_Free(p);
12203 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012204}
12205
Tim Peters38fd5b62000-09-21 05:43:11 +000012206static PyObject*
12207formatlong(PyObject *val, int flags, int prec, int type)
12208{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012209 char *buf;
12210 int len;
12211 PyObject *str; /* temporary string object. */
12212 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012213
Benjamin Peterson14339b62009-01-31 16:36:08 +000012214 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
12215 if (!str)
12216 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012217 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012218 Py_DECREF(str);
12219 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012220}
12221
Guido van Rossumd57fd912000-03-10 22:53:23 +000012222static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012223formatchar(Py_UCS4 *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012224 size_t buflen,
12225 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012226{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012227 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012228 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012229 if (PyUnicode_GET_LENGTH(v) == 1) {
12230 buf[0] = PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000012231 buf[1] = '\0';
12232 return 1;
12233 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012234 goto onError;
12235 }
12236 else {
12237 /* Integer input truncated to a character */
12238 long x;
12239 x = PyLong_AsLong(v);
12240 if (x == -1 && PyErr_Occurred())
12241 goto onError;
12242
12243 if (x < 0 || x > 0x10ffff) {
12244 PyErr_SetString(PyExc_OverflowError,
12245 "%c arg not in range(0x110000)");
12246 return -1;
12247 }
12248
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012249 buf[0] = (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012250 buf[1] = '\0';
12251 return 1;
12252 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012253
Benjamin Peterson29060642009-01-31 22:14:21 +000012254 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012255 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012256 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012257 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012258}
12259
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012260/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012261 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012262*/
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012263#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012264
Alexander Belopolsky40018472011-02-26 01:02:56 +000012265PyObject *
12266PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012267{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012268 void *fmt;
12269 int fmtkind;
12270 PyObject *result;
12271 Py_UCS4 *res, *res0;
12272 Py_UCS4 max;
12273 int kind;
12274 Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012275 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012276 PyObject *dict = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012277 PyUnicodeObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +000012278
Guido van Rossumd57fd912000-03-10 22:53:23 +000012279 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012280 PyErr_BadInternalCall();
12281 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012282 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012283 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
12284 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012285 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012286 fmt = PyUnicode_DATA(uformat);
12287 fmtkind = PyUnicode_KIND(uformat);
12288 fmtcnt = PyUnicode_GET_LENGTH(uformat);
12289 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012290
12291 reslen = rescnt = fmtcnt + 100;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012292 res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4));
12293 if (res0 == NULL) {
12294 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012295 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012296 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012297
12298 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012299 arglen = PyTuple_Size(args);
12300 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012301 }
12302 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012303 arglen = -1;
12304 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012305 }
Christian Heimes90aa7642007-12-19 02:45:37 +000012306 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000012307 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000012308 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012309
12310 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012311 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012312 if (--rescnt < 0) {
12313 rescnt = fmtcnt + 100;
12314 reslen += rescnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012315 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12316 if (res0 == NULL){
12317 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012318 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012319 }
12320 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012321 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012322 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012323 *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012324 }
12325 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012326 /* Got a format specifier */
12327 int flags = 0;
12328 Py_ssize_t width = -1;
12329 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012330 Py_UCS4 c = '\0';
12331 Py_UCS4 fill;
Benjamin Peterson29060642009-01-31 22:14:21 +000012332 int isnumok;
12333 PyObject *v = NULL;
12334 PyObject *temp = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012335 void *pbuf;
12336 Py_ssize_t pindex;
Benjamin Peterson29060642009-01-31 22:14:21 +000012337 Py_UNICODE sign;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012338 Py_ssize_t len, len1;
12339 Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012340
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012341 fmtpos++;
12342 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
12343 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000012344 Py_ssize_t keylen;
12345 PyObject *key;
12346 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000012347
Benjamin Peterson29060642009-01-31 22:14:21 +000012348 if (dict == NULL) {
12349 PyErr_SetString(PyExc_TypeError,
12350 "format requires a mapping");
12351 goto onError;
12352 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012353 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012354 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012355 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012356 /* Skip over balanced parentheses */
12357 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012358 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000012359 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012360 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000012361 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012362 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000012363 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012364 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012365 if (fmtcnt < 0 || pcount > 0) {
12366 PyErr_SetString(PyExc_ValueError,
12367 "incomplete format key");
12368 goto onError;
12369 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020012370 key = PyUnicode_Substring((PyObject*)uformat,
12371 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000012372 if (key == NULL)
12373 goto onError;
12374 if (args_owned) {
12375 Py_DECREF(args);
12376 args_owned = 0;
12377 }
12378 args = PyObject_GetItem(dict, key);
12379 Py_DECREF(key);
12380 if (args == NULL) {
12381 goto onError;
12382 }
12383 args_owned = 1;
12384 arglen = -1;
12385 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012386 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012387 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012388 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012389 case '-': flags |= F_LJUST; continue;
12390 case '+': flags |= F_SIGN; continue;
12391 case ' ': flags |= F_BLANK; continue;
12392 case '#': flags |= F_ALT; continue;
12393 case '0': flags |= F_ZERO; continue;
12394 }
12395 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012396 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012397 if (c == '*') {
12398 v = getnextarg(args, arglen, &argidx);
12399 if (v == NULL)
12400 goto onError;
12401 if (!PyLong_Check(v)) {
12402 PyErr_SetString(PyExc_TypeError,
12403 "* wants int");
12404 goto onError;
12405 }
12406 width = PyLong_AsLong(v);
12407 if (width == -1 && PyErr_Occurred())
12408 goto onError;
12409 if (width < 0) {
12410 flags |= F_LJUST;
12411 width = -width;
12412 }
12413 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012414 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012415 }
12416 else if (c >= '0' && c <= '9') {
12417 width = c - '0';
12418 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012419 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012420 if (c < '0' || c > '9')
12421 break;
12422 if ((width*10) / 10 != width) {
12423 PyErr_SetString(PyExc_ValueError,
12424 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000012425 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000012426 }
12427 width = width*10 + (c - '0');
12428 }
12429 }
12430 if (c == '.') {
12431 prec = 0;
12432 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012433 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012434 if (c == '*') {
12435 v = getnextarg(args, arglen, &argidx);
12436 if (v == NULL)
12437 goto onError;
12438 if (!PyLong_Check(v)) {
12439 PyErr_SetString(PyExc_TypeError,
12440 "* wants int");
12441 goto onError;
12442 }
12443 prec = PyLong_AsLong(v);
12444 if (prec == -1 && PyErr_Occurred())
12445 goto onError;
12446 if (prec < 0)
12447 prec = 0;
12448 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012449 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012450 }
12451 else if (c >= '0' && c <= '9') {
12452 prec = c - '0';
12453 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012454 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012455 if (c < '0' || c > '9')
12456 break;
12457 if ((prec*10) / 10 != prec) {
12458 PyErr_SetString(PyExc_ValueError,
12459 "prec too big");
12460 goto onError;
12461 }
12462 prec = prec*10 + (c - '0');
12463 }
12464 }
12465 } /* prec */
12466 if (fmtcnt >= 0) {
12467 if (c == 'h' || c == 'l' || c == 'L') {
12468 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012469 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012470 }
12471 }
12472 if (fmtcnt < 0) {
12473 PyErr_SetString(PyExc_ValueError,
12474 "incomplete format");
12475 goto onError;
12476 }
12477 if (c != '%') {
12478 v = getnextarg(args, arglen, &argidx);
12479 if (v == NULL)
12480 goto onError;
12481 }
12482 sign = 0;
12483 fill = ' ';
12484 switch (c) {
12485
12486 case '%':
12487 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012488 kind = PyUnicode_4BYTE_KIND;
Benjamin Peterson29060642009-01-31 22:14:21 +000012489 /* presume that buffer length is at least 1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012490 PyUnicode_WRITE(kind, pbuf, 0, '%');
Benjamin Peterson29060642009-01-31 22:14:21 +000012491 len = 1;
12492 break;
12493
12494 case 's':
12495 case 'r':
12496 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000012497 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012498 temp = v;
12499 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012500 }
12501 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012502 if (c == 's')
12503 temp = PyObject_Str(v);
12504 else if (c == 'r')
12505 temp = PyObject_Repr(v);
12506 else
12507 temp = PyObject_ASCII(v);
12508 if (temp == NULL)
12509 goto onError;
12510 if (PyUnicode_Check(temp))
12511 /* nothing to do */;
12512 else {
12513 Py_DECREF(temp);
12514 PyErr_SetString(PyExc_TypeError,
12515 "%s argument has non-string str()");
12516 goto onError;
12517 }
12518 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012519 if (PyUnicode_READY(temp) == -1) {
12520 Py_CLEAR(temp);
12521 goto onError;
12522 }
12523 pbuf = PyUnicode_DATA(temp);
12524 kind = PyUnicode_KIND(temp);
12525 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012526 if (prec >= 0 && len > prec)
12527 len = prec;
12528 break;
12529
12530 case 'i':
12531 case 'd':
12532 case 'u':
12533 case 'o':
12534 case 'x':
12535 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000012536 isnumok = 0;
12537 if (PyNumber_Check(v)) {
12538 PyObject *iobj=NULL;
12539
12540 if (PyLong_Check(v)) {
12541 iobj = v;
12542 Py_INCREF(iobj);
12543 }
12544 else {
12545 iobj = PyNumber_Long(v);
12546 }
12547 if (iobj!=NULL) {
12548 if (PyLong_Check(iobj)) {
12549 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070012550 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000012551 Py_DECREF(iobj);
12552 if (!temp)
12553 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012554 if (PyUnicode_READY(temp) == -1) {
12555 Py_CLEAR(temp);
12556 goto onError;
12557 }
12558 pbuf = PyUnicode_DATA(temp);
12559 kind = PyUnicode_KIND(temp);
12560 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012561 sign = 1;
12562 }
12563 else {
12564 Py_DECREF(iobj);
12565 }
12566 }
12567 }
12568 if (!isnumok) {
12569 PyErr_Format(PyExc_TypeError,
12570 "%%%c format: a number is required, "
12571 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
12572 goto onError;
12573 }
12574 if (flags & F_ZERO)
12575 fill = '0';
12576 break;
12577
12578 case 'e':
12579 case 'E':
12580 case 'f':
12581 case 'F':
12582 case 'g':
12583 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012584 temp = formatfloat(v, flags, prec, c);
12585 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000012586 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012587 if (PyUnicode_READY(temp) == -1) {
12588 Py_CLEAR(temp);
12589 goto onError;
12590 }
12591 pbuf = PyUnicode_DATA(temp);
12592 kind = PyUnicode_KIND(temp);
12593 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012594 sign = 1;
12595 if (flags & F_ZERO)
12596 fill = '0';
12597 break;
12598
12599 case 'c':
12600 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012601 kind = PyUnicode_4BYTE_KIND;
Victor Stinnerb9dcffb2011-09-29 00:39:24 +020012602 len = formatchar(pbuf, Py_ARRAY_LENGTH(formatbuf), v);
Benjamin Peterson29060642009-01-31 22:14:21 +000012603 if (len < 0)
12604 goto onError;
12605 break;
12606
12607 default:
12608 PyErr_Format(PyExc_ValueError,
12609 "unsupported format character '%c' (0x%x) "
12610 "at index %zd",
12611 (31<=c && c<=126) ? (char)c : '?',
12612 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012613 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000012614 goto onError;
12615 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012616 /* pbuf is initialized here. */
12617 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000012618 if (sign) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012619 if (PyUnicode_READ(kind, pbuf, pindex) == '-' ||
12620 PyUnicode_READ(kind, pbuf, pindex) == '+') {
12621 sign = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012622 len--;
12623 }
12624 else if (flags & F_SIGN)
12625 sign = '+';
12626 else if (flags & F_BLANK)
12627 sign = ' ';
12628 else
12629 sign = 0;
12630 }
12631 if (width < len)
12632 width = len;
12633 if (rescnt - (sign != 0) < width) {
12634 reslen -= rescnt;
12635 rescnt = width + fmtcnt + 100;
12636 reslen += rescnt;
12637 if (reslen < 0) {
12638 Py_XDECREF(temp);
12639 PyErr_NoMemory();
12640 goto onError;
12641 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012642 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12643 if (res0 == 0) {
12644 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012645 Py_XDECREF(temp);
12646 goto onError;
12647 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012648 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012649 }
12650 if (sign) {
12651 if (fill != ' ')
12652 *res++ = sign;
12653 rescnt--;
12654 if (width > len)
12655 width--;
12656 }
12657 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012658 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12659 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000012660 if (fill != ' ') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012661 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12662 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012663 }
12664 rescnt -= 2;
12665 width -= 2;
12666 if (width < 0)
12667 width = 0;
12668 len -= 2;
12669 }
12670 if (width > len && !(flags & F_LJUST)) {
12671 do {
12672 --rescnt;
12673 *res++ = fill;
12674 } while (--width > len);
12675 }
12676 if (fill == ' ') {
12677 if (sign)
12678 *res++ = sign;
12679 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012680 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12681 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
12682 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12683 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012684 }
12685 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012686 /* Copy all characters, preserving len */
12687 len1 = len;
12688 while (len1--) {
12689 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12690 rescnt--;
12691 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012692 while (--width >= len) {
12693 --rescnt;
12694 *res++ = ' ';
12695 }
12696 if (dict && (argidx < arglen) && c != '%') {
12697 PyErr_SetString(PyExc_TypeError,
12698 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +000012699 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012700 goto onError;
12701 }
12702 Py_XDECREF(temp);
12703 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012704 } /* until end */
12705 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012706 PyErr_SetString(PyExc_TypeError,
12707 "not all arguments converted during string formatting");
12708 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012709 }
12710
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012711
12712 for (max=0, res = res0; res < res0+reslen-rescnt; res++)
12713 if (*res > max)
12714 max = *res;
12715 result = PyUnicode_New(reslen - rescnt, max);
12716 if (!result)
Benjamin Peterson29060642009-01-31 22:14:21 +000012717 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012718 kind = PyUnicode_KIND(result);
12719 for (res = res0; res < res0+reslen-rescnt; res++)
12720 PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res);
12721 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012722 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012723 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012724 }
12725 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012726 return (PyObject *)result;
12727
Benjamin Peterson29060642009-01-31 22:14:21 +000012728 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012729 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012730 Py_DECREF(uformat);
12731 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012732 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012733 }
12734 return NULL;
12735}
12736
Jeremy Hylton938ace62002-07-17 16:30:39 +000012737static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000012738unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
12739
Tim Peters6d6c1a32001-08-02 04:15:00 +000012740static PyObject *
12741unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12742{
Benjamin Peterson29060642009-01-31 22:14:21 +000012743 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012744 static char *kwlist[] = {"object", "encoding", "errors", 0};
12745 char *encoding = NULL;
12746 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000012747
Benjamin Peterson14339b62009-01-31 16:36:08 +000012748 if (type != &PyUnicode_Type)
12749 return unicode_subtype_new(type, args, kwds);
12750 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000012751 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012752 return NULL;
12753 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012754 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012755 if (encoding == NULL && errors == NULL)
12756 return PyObject_Str(x);
12757 else
Benjamin Peterson29060642009-01-31 22:14:21 +000012758 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000012759}
12760
Guido van Rossume023fe02001-08-30 03:12:59 +000012761static PyObject *
12762unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12763{
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012764 PyUnicodeObject *unicode, *self;
12765 Py_ssize_t length, char_size;
12766 int share_wstr, share_utf8;
12767 unsigned int kind;
12768 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000012769
Benjamin Peterson14339b62009-01-31 16:36:08 +000012770 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012771
12772 unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
12773 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012774 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020012775 assert(_PyUnicode_CHECK(unicode));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012776 if (PyUnicode_READY(unicode))
12777 return NULL;
12778
12779 self = (PyUnicodeObject *) type->tp_alloc(type, 0);
12780 if (self == NULL) {
12781 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012782 return NULL;
12783 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012784 kind = PyUnicode_KIND(unicode);
12785 length = PyUnicode_GET_LENGTH(unicode);
12786
12787 _PyUnicode_LENGTH(self) = length;
12788 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
12789 _PyUnicode_STATE(self).interned = 0;
12790 _PyUnicode_STATE(self).kind = kind;
12791 _PyUnicode_STATE(self).compact = 0;
12792 _PyUnicode_STATE(self).ascii = 0;
12793 _PyUnicode_STATE(self).ready = 1;
12794 _PyUnicode_WSTR(self) = NULL;
12795 _PyUnicode_UTF8_LENGTH(self) = 0;
12796 _PyUnicode_UTF8(self) = NULL;
12797 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020012798 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012799
12800 share_utf8 = 0;
12801 share_wstr = 0;
12802 if (kind == PyUnicode_1BYTE_KIND) {
12803 char_size = 1;
12804 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
12805 share_utf8 = 1;
12806 }
12807 else if (kind == PyUnicode_2BYTE_KIND) {
12808 char_size = 2;
12809 if (sizeof(wchar_t) == 2)
12810 share_wstr = 1;
12811 }
12812 else {
12813 assert(kind == PyUnicode_4BYTE_KIND);
12814 char_size = 4;
12815 if (sizeof(wchar_t) == 4)
12816 share_wstr = 1;
12817 }
12818
12819 /* Ensure we won't overflow the length. */
12820 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
12821 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012822 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012823 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012824 data = PyObject_MALLOC((length + 1) * char_size);
12825 if (data == NULL) {
12826 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012827 goto onError;
12828 }
12829
Victor Stinnerc3c74152011-10-02 20:39:55 +020012830 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012831 if (share_utf8) {
12832 _PyUnicode_UTF8_LENGTH(self) = length;
12833 _PyUnicode_UTF8(self) = data;
12834 }
12835 if (share_wstr) {
12836 _PyUnicode_WSTR_LENGTH(self) = length;
12837 _PyUnicode_WSTR(self) = (wchar_t *)data;
12838 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012839
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012840 Py_MEMCPY(data, PyUnicode_DATA(unicode),
12841 PyUnicode_KIND_SIZE(kind, length + 1));
12842 Py_DECREF(unicode);
12843 return (PyObject *)self;
12844
12845onError:
12846 Py_DECREF(unicode);
12847 Py_DECREF(self);
12848 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000012849}
12850
/* Docstring of the str type; installed as tp_doc in PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
             "str(string[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000012857
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012858static PyObject *unicode_iter(PyObject *seq);
12859
/* Type object for str.  Slots set to 0 are left unset in this initializer;
   _PyUnicode_Init() passes this object to PyType_Ready(). */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",              /* tp_name */
    sizeof(PyUnicodeObject),        /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    unicode_repr,           /* tp_repr */
    &unicode_as_number,         /* tp_as_number */
    &unicode_as_sequence,       /* tp_as_sequence */
    &unicode_as_mapping,        /* tp_as_mapping */
    (hashfunc) unicode_hash,        /* tp_hash*/
    0,                  /* tp_call*/
    (reprfunc) unicode_str,     /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    /* BASETYPE: str may be subclassed (see unicode_subtype_new()). */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
    unicode_doc,            /* tp_doc */
    0,                  /* tp_traverse */
    0,                  /* tp_clear */
    PyUnicode_RichCompare,      /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    unicode_iter,           /* tp_iter */
    0,                  /* tp_iternext */
    unicode_methods,            /* tp_methods */
    0,                  /* tp_members */
    0,                  /* tp_getset */
    &PyBaseObject_Type,         /* tp_base */
    0,                  /* tp_dict */
    0,                  /* tp_descr_get */
    0,                  /* tp_descr_set */
    0,                  /* tp_dictoffset */
    0,                  /* tp_init */
    0,                  /* tp_alloc */
    unicode_new,            /* tp_new */
    PyObject_Del,           /* tp_free */
};
12903
12904/* Initialize the Unicode implementation */
12905
Thomas Wouters78890102000-07-22 19:25:51 +000012906void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012907{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012908 int i;
12909
Thomas Wouters477c8d52006-05-27 19:21:47 +000012910 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012911 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012912 0x000A, /* LINE FEED */
12913 0x000D, /* CARRIAGE RETURN */
12914 0x001C, /* FILE SEPARATOR */
12915 0x001D, /* GROUP SEPARATOR */
12916 0x001E, /* RECORD SEPARATOR */
12917 0x0085, /* NEXT LINE */
12918 0x2028, /* LINE SEPARATOR */
12919 0x2029, /* PARAGRAPH SEPARATOR */
12920 };
12921
Fred Drakee4315f52000-05-09 19:53:39 +000012922 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020012923 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012924 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012925 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012926
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012927 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000012928 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000012929 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012930 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012931
12932 /* initialize the linebreak bloom filter */
12933 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012934 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020012935 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012936
12937 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012938}
12939
12940/* Finalize the Unicode implementation */
12941
/* Clear the free list of str objects.  This implementation releases
   nothing and always reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
12947
Guido van Rossumd57fd912000-03-10 22:53:23 +000012948void
Thomas Wouters78890102000-07-22 19:25:51 +000012949_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012950{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012951 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012952
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000012953 Py_XDECREF(unicode_empty);
12954 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000012955
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012956 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012957 if (unicode_latin1[i]) {
12958 Py_DECREF(unicode_latin1[i]);
12959 unicode_latin1[i] = NULL;
12960 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012961 }
Christian Heimesa156e092008-02-16 07:38:31 +000012962 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012963}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000012964
Walter Dörwald16807132007-05-25 13:52:07 +000012965void
12966PyUnicode_InternInPlace(PyObject **p)
12967{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012968 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
12969 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020012970#ifdef Py_DEBUG
12971 assert(s != NULL);
12972 assert(_PyUnicode_CHECK(s));
12973#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000012974 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020012975 return;
12976#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000012977 /* If it's a subclass, we don't really know what putting
12978 it in the interned dict might do. */
12979 if (!PyUnicode_CheckExact(s))
12980 return;
12981 if (PyUnicode_CHECK_INTERNED(s))
12982 return;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012983 if (PyUnicode_READY(s) == -1) {
Victor Stinner4fae54c2011-10-03 02:01:52 +020012984 assert(0 && "PyUnicode_READY fail in PyUnicode_InternInPlace");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012985 return;
12986 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012987 if (interned == NULL) {
12988 interned = PyDict_New();
12989 if (interned == NULL) {
12990 PyErr_Clear(); /* Don't leave an exception */
12991 return;
12992 }
12993 }
12994 /* It might be that the GetItem call fails even
12995 though the key is present in the dictionary,
12996 namely when this happens during a stack overflow. */
12997 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000012998 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012999 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000013000
Benjamin Peterson29060642009-01-31 22:14:21 +000013001 if (t) {
13002 Py_INCREF(t);
13003 Py_DECREF(*p);
13004 *p = t;
13005 return;
13006 }
Walter Dörwald16807132007-05-25 13:52:07 +000013007
Benjamin Peterson14339b62009-01-31 16:36:08 +000013008 PyThreadState_GET()->recursion_critical = 1;
13009 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
13010 PyErr_Clear();
13011 PyThreadState_GET()->recursion_critical = 0;
13012 return;
13013 }
13014 PyThreadState_GET()->recursion_critical = 0;
13015 /* The two references in interned are not counted by refcnt.
13016 The deallocator will take care of this */
13017 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013018 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000013019}
13020
13021void
13022PyUnicode_InternImmortal(PyObject **p)
13023{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013024 PyUnicodeObject *u = (PyUnicodeObject *)*p;
13025
Benjamin Peterson14339b62009-01-31 16:36:08 +000013026 PyUnicode_InternInPlace(p);
13027 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013028 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013029 Py_INCREF(*p);
13030 }
Walter Dörwald16807132007-05-25 13:52:07 +000013031}
13032
13033PyObject *
13034PyUnicode_InternFromString(const char *cp)
13035{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013036 PyObject *s = PyUnicode_FromString(cp);
13037 if (s == NULL)
13038 return NULL;
13039 PyUnicode_InternInPlace(&s);
13040 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000013041}
13042
Alexander Belopolsky40018472011-02-26 01:02:56 +000013043void
13044_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000013045{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013046 PyObject *keys;
13047 PyUnicodeObject *s;
13048 Py_ssize_t i, n;
13049 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000013050
Benjamin Peterson14339b62009-01-31 16:36:08 +000013051 if (interned == NULL || !PyDict_Check(interned))
13052 return;
13053 keys = PyDict_Keys(interned);
13054 if (keys == NULL || !PyList_Check(keys)) {
13055 PyErr_Clear();
13056 return;
13057 }
Walter Dörwald16807132007-05-25 13:52:07 +000013058
Benjamin Peterson14339b62009-01-31 16:36:08 +000013059 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
13060 detector, interned unicode strings are not forcibly deallocated;
13061 rather, we give them their stolen references back, and then clear
13062 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000013063
Benjamin Peterson14339b62009-01-31 16:36:08 +000013064 n = PyList_GET_SIZE(keys);
13065 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000013066 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013067 for (i = 0; i < n; i++) {
13068 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013069 if (PyUnicode_READY(s) == -1)
13070 fprintf(stderr, "could not ready string\n");
13071 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013072 case SSTATE_NOT_INTERNED:
13073 /* XXX Shouldn't happen */
13074 break;
13075 case SSTATE_INTERNED_IMMORTAL:
13076 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013077 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013078 break;
13079 case SSTATE_INTERNED_MORTAL:
13080 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013081 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013082 break;
13083 default:
13084 Py_FatalError("Inconsistent interned string state.");
13085 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013086 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013087 }
13088 fprintf(stderr, "total size of all interned strings: "
13089 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
13090 "mortal/immortal\n", mortal_size, immortal_size);
13091 Py_DECREF(keys);
13092 PyDict_Clear(interned);
13093 Py_DECREF(interned);
13094 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000013095}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013096
13097
13098/********************* Unicode Iterator **************************/
13099
13100typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013101 PyObject_HEAD
13102 Py_ssize_t it_index;
13103 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013104} unicodeiterobject;
13105
13106static void
13107unicodeiter_dealloc(unicodeiterobject *it)
13108{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013109 _PyObject_GC_UNTRACK(it);
13110 Py_XDECREF(it->it_seq);
13111 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013112}
13113
13114static int
13115unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
13116{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013117 Py_VISIT(it->it_seq);
13118 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013119}
13120
13121static PyObject *
13122unicodeiter_next(unicodeiterobject *it)
13123{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013124 PyUnicodeObject *seq;
13125 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013126
Benjamin Peterson14339b62009-01-31 16:36:08 +000013127 assert(it != NULL);
13128 seq = it->it_seq;
13129 if (seq == NULL)
13130 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013131 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013132
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013133 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
13134 int kind = PyUnicode_KIND(seq);
13135 void *data = PyUnicode_DATA(seq);
13136 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
13137 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013138 if (item != NULL)
13139 ++it->it_index;
13140 return item;
13141 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013142
Benjamin Peterson14339b62009-01-31 16:36:08 +000013143 Py_DECREF(seq);
13144 it->it_seq = NULL;
13145 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013146}
13147
13148static PyObject *
13149unicodeiter_len(unicodeiterobject *it)
13150{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013151 Py_ssize_t len = 0;
13152 if (it->it_seq)
13153 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
13154 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013155}
13156
13157PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
13158
13159static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013160 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000013161 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000013162 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013163};
13164
13165PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013166 PyVarObject_HEAD_INIT(&PyType_Type, 0)
13167 "str_iterator", /* tp_name */
13168 sizeof(unicodeiterobject), /* tp_basicsize */
13169 0, /* tp_itemsize */
13170 /* methods */
13171 (destructor)unicodeiter_dealloc, /* tp_dealloc */
13172 0, /* tp_print */
13173 0, /* tp_getattr */
13174 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013175 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013176 0, /* tp_repr */
13177 0, /* tp_as_number */
13178 0, /* tp_as_sequence */
13179 0, /* tp_as_mapping */
13180 0, /* tp_hash */
13181 0, /* tp_call */
13182 0, /* tp_str */
13183 PyObject_GenericGetAttr, /* tp_getattro */
13184 0, /* tp_setattro */
13185 0, /* tp_as_buffer */
13186 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
13187 0, /* tp_doc */
13188 (traverseproc)unicodeiter_traverse, /* tp_traverse */
13189 0, /* tp_clear */
13190 0, /* tp_richcompare */
13191 0, /* tp_weaklistoffset */
13192 PyObject_SelfIter, /* tp_iter */
13193 (iternextfunc)unicodeiter_next, /* tp_iternext */
13194 unicodeiter_methods, /* tp_methods */
13195 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013196};
13197
13198static PyObject *
13199unicode_iter(PyObject *seq)
13200{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013201 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013202
Benjamin Peterson14339b62009-01-31 16:36:08 +000013203 if (!PyUnicode_Check(seq)) {
13204 PyErr_BadInternalCall();
13205 return NULL;
13206 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013207 if (PyUnicode_READY(seq) == -1)
13208 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013209 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
13210 if (it == NULL)
13211 return NULL;
13212 it->it_index = 0;
13213 Py_INCREF(seq);
13214 it->it_seq = (PyUnicodeObject *)seq;
13215 _PyObject_GC_TRACK(it);
13216 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013217}
13218
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013219#define UNIOP(x) Py_UNICODE_##x
13220#define UNIOP_t Py_UNICODE
13221#include "uniops.h"
13222#undef UNIOP
13223#undef UNIOP_t
13224#define UNIOP(x) Py_UCS4_##x
13225#define UNIOP_t Py_UCS4
13226#include "uniops.h"
13227#undef UNIOP
13228#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000013229
Victor Stinner71133ff2010-09-01 23:43:53 +000013230Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000013231PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000013232{
13233 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
13234 Py_UNICODE *copy;
13235 Py_ssize_t size;
13236
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013237 if (!PyUnicode_Check(unicode)) {
13238 PyErr_BadArgument();
13239 return NULL;
13240 }
Victor Stinner71133ff2010-09-01 23:43:53 +000013241 /* Ensure we won't overflow the size. */
13242 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
13243 PyErr_NoMemory();
13244 return NULL;
13245 }
13246 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
13247 size *= sizeof(Py_UNICODE);
13248 copy = PyMem_Malloc(size);
13249 if (copy == NULL) {
13250 PyErr_NoMemory();
13251 return NULL;
13252 }
13253 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
13254 return copy;
13255}
Martin v. Löwis5b222132007-06-10 09:51:05 +000013256
Georg Brandl66c221e2010-10-14 07:04:07 +000013257/* A _string module, to export formatter_parser and formatter_field_name_split
13258 to the string.Formatter class implemented in Python. */
13259
13260static PyMethodDef _string_methods[] = {
13261 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
13262 METH_O, PyDoc_STR("split the argument as a field name")},
13263 {"formatter_parser", (PyCFunction) formatter_parser,
13264 METH_O, PyDoc_STR("parse the argument as a format string")},
13265 {NULL, NULL}
13266};
13267
13268static struct PyModuleDef _string_module = {
13269 PyModuleDef_HEAD_INIT,
13270 "_string",
13271 PyDoc_STR("string helper module"),
13272 0,
13273 _string_methods,
13274 NULL,
13275 NULL,
13276 NULL,
13277 NULL
13278};
13279
13280PyMODINIT_FUNC
13281PyInit__string(void)
13282{
13283 return PyModule_Create(&_string_module);
13284}
13285
13286
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013287#ifdef __cplusplus
13288}
13289#endif