/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/

Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
/* Limit for the Unicode object free list */

#define PyUnicode_MAXFREELIST       1024

/* Limit for the Unicode object free list stay alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a size less than this
   limit. This reduces malloc() overhead for small Unicode objects.

   At worst this will result in PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Endianness switches; defaults to little endian.  Exactly one of
   BYTEORDER_IS_BIG_ENDIAN / BYTEORDER_IS_LITTLE_ENDIAN ends up defined. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif

/* --- Globals ------------------------------------------------------------

   The globals are initialized by the _PyUnicode_Init() API and should
   not be used before calling that API.

*/


#ifdef __cplusplus
extern "C" {
#endif

/* Type check used inside the assert()s of this file: debug builds run the
   full _PyUnicode_CheckConsistency() validation, other builds only do
   PyUnicode_Check(). */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif

/* Accessors for the fields of the string structs.

   Macros that expand to a bare member access (_PyUnicode_UTF8,
   _PyUnicode_UTF8_LENGTH, _PyUnicode_WSTR, _PyUnicode_WSTR_LENGTH,
   _PyUnicode_LENGTH, _PyUnicode_STATE, _PyUnicode_HASH,
   _PyUnicode_DATA_ANY) perform no checks and can be assigned to.

   Macros that start with assert() calls are comma expressions and can only
   be read. */

/* utf8 member of a PyCompactUnicodeObject (unchecked). */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* UTF-8 buffer of a ready string: for compact ASCII strings this is the
   memory directly after the PyASCIIObject header, otherwise the utf8
   member. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* utf8_length member of a PyCompactUnicodeObject (unchecked). */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* Length of the UTF-8 form of a ready string: the character length for
   compact ASCII strings, otherwise the utf8_length member. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
#define _PyUnicode_WSTR(op) (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op) (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) (((PyASCIIObject *)(op))->hash)
/* state.kind, after asserting op passes _PyUnicode_CHECK. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
/* length, after asserting op passes _PyUnicode_CHECK. */
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* data.any member of a PyUnicodeObject (unchecked). */
#define _PyUnicode_DATA_ANY(op) (((PyUnicodeObject*)(op))->data.any)

/* File-local replacement for any earlier PyUnicode_READY definition.
   Asserts that op passes _PyUnicode_CHECK, then evaluates to 0 if the
   string is already ready and to the result of _PyUnicode_Ready()
   otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 : _PyUnicode_Ready((PyObject *)(op))))

/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data): the string is not compact ASCII, its utf8
   pointer is set, and that pointer differs from the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Each source element in [begin, end) is cast to to_type and stored in
   consecutive slots of the destination; nothing is written past those
   (end - begin) slots, so no terminator is added. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        const from_type *iter_; to_type *to_;           \
        for (iter_ = (begin), to_ = (to_type *)(to);    \
             iter_ < (end);                             \
             ++iter_, ++to_) {                          \
            *to_ = (to_type)*iter_;                     \
        }                                               \
    } while (0)

/* The Unicode string has been modified: reset the hash */
#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)

Walter Dörwald16807132007-05-25 13:52:07 +0000159/* This dictionary holds all interned unicode strings. Note that references
160 to strings in this dictionary are *not* counted in the string's ob_refcnt.
161 When the interned string reaches a refcnt of 0 the string deallocation
162 function will delete the reference from this dictionary.
163
164 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000165 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000166*/
167static PyObject *interned;
168
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000169/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200170static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000171
172/* Single character Unicode strings in the Latin-1 range are being
173 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200174static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000175
/* Fast detection of the most frequent whitespace characters.
   128-entry lookup table indexed by ASCII code: 1 for whitespace, else 0. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x0009: * CHARACTER TABULATION */
/*     case 0x000A: * LINE FEED */
/*     case 0x000B: * LINE TABULATION */
/*     case 0x000C: * FORM FEED */
/*     case 0x000D: * CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x001C: * FILE SEPARATOR */
/*     case 0x001D: * GROUP SEPARATOR */
/*     case 0x001E: * RECORD SEPARATOR */
/*     case 0x001F: * UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
/*     case 0x0020: * SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};

Victor Stinnerfe226c02011-10-03 03:52:20 +0200207static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
208
Alexander Belopolsky40018472011-02-26 01:02:56 +0000209static PyObject *
210unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000211 PyObject **errorHandler,const char *encoding, const char *reason,
212 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
213 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
214
Alexander Belopolsky40018472011-02-26 01:02:56 +0000215static void
216raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300217 const char *encoding,
218 const Py_UNICODE *unicode, Py_ssize_t size,
219 Py_ssize_t startpos, Py_ssize_t endpos,
220 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000221
/* Same for linebreaks.
   128-entry lookup table indexed by ASCII code: 1 for line breaks, else 0. */
static unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};

Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300250/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
251 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000252Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000253PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000254{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000255#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000256 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000257#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000258 /* This is actually an illegal character, so it should
259 not be passed to unichr. */
260 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000261#endif
262}
263
#ifdef Py_DEBUG
/* Debug-build validation of the state flags of a unicode object.

   op must be a unicode object.  Every invariant is enforced with assert(),
   so a violation aborts; when all checks pass the function returns 1, which
   lets callers wrap it as assert(_PyUnicode_CheckConsistency(op)).

   Four layouts are distinguished by the state flags:
     - ascii flag set:           1-byte kind, compact, ready
     - compact, not ascii:       1/2/4-byte kind, ready
     - not compact, WCHAR kind:  not ready, wstr set, no data, no utf8,
                                 not interned
     - not compact, other kind:  1/2/4-byte kind, ready, data set, not ascii
*/
static int
_PyUnicode_CheckConsistency(void *op)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1) {
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.compact == 1);
        assert(ascii->state.ready == 1);
    }
    else if (ascii->state.compact == 1) {
        assert(kind == PyUnicode_1BYTE_KIND
               || kind == PyUnicode_2BYTE_KIND
               || kind == PyUnicode_4BYTE_KIND);
        assert(ascii->state.ascii == 0);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        PyUnicodeObject *unicode = (PyUnicodeObject *)op;

        if (kind == PyUnicode_WCHAR_KIND) {
            /* "flag == 0" replaces the former "!flag == 1" spelling; the
               two are equivalent but the latter reads like a precedence
               mistake. */
            assert(ascii->state.compact == 0);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 0);
            assert(ascii->wstr != NULL);
            assert(unicode->data.any == NULL);
            assert(compact->utf8 == NULL);
            assert(ascii->state.interned == SSTATE_NOT_INTERNED);
        }
        else {
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.compact == 0);
            assert(ascii->state.ready == 1);
            assert(unicode->data.any != NULL);
            assert(ascii->state.ascii == 0);
        }
    }
    return 1;
}
#endif

Thomas Wouters477c8d52006-05-27 19:21:47 +0000315/* --- Bloom Filters ----------------------------------------------------- */
316
317/* stuff to implement simple "bloom filters" for Unicode characters.
318 to keep things simple, we use a single bitmask, using the least 5
319 bits from each unicode characters as the bit index. */
320
321/* the linebreak mask is set up by Unicode_Init below */
322
Antoine Pitrouf068f942010-01-13 14:19:12 +0000323#if LONG_BIT >= 128
324#define BLOOM_WIDTH 128
325#elif LONG_BIT >= 64
326#define BLOOM_WIDTH 64
327#elif LONG_BIT >= 32
328#define BLOOM_WIDTH 32
329#else
330#error "LONG_BIT is smaller than 32"
331#endif
332
Thomas Wouters477c8d52006-05-27 19:21:47 +0000333#define BLOOM_MASK unsigned long
334
335static BLOOM_MASK bloom_linebreak;
336
Antoine Pitrouf068f942010-01-13 14:19:12 +0000337#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
338#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000339
Benjamin Peterson29060642009-01-31 22:14:21 +0000340#define BLOOM_LINEBREAK(ch) \
341 ((ch) < 128U ? ascii_linebreak[(ch)] : \
342 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000343
Alexander Belopolsky40018472011-02-26 01:02:56 +0000344Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200345make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000346{
347 /* calculate simple bloom-style bitmask for a given unicode string */
348
Antoine Pitrouf068f942010-01-13 14:19:12 +0000349 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000350 Py_ssize_t i;
351
352 mask = 0;
353 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200354 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000355
356 return mask;
357}
358
/* True if chr occurs in str: the bloom mask is tested first and
   PyUnicode_FindChar only runs on a mask hit. */
#define BLOOM_MEMBER(mask, chr, str) \
    (BLOOM((mask), (chr)) \
     && (PyUnicode_FindChar((str), (chr), 0, PyUnicode_GET_LENGTH(str), 1) >= 0))

Guido van Rossumd57fd912000-03-10 22:53:23 +0000363/* --- Unicode Object ----------------------------------------------------- */
364
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200365static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200366fixup(PyUnicodeObject *self, Py_UCS4 (*fixfct)(PyUnicodeObject *s));
367
368Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
369 Py_ssize_t size, Py_UCS4 ch,
370 int direction)
371{
372 /* like wcschr, but doesn't stop at NULL characters */
373 Py_ssize_t i;
374 if (direction == 1) {
375 for(i = 0; i < size; i++)
376 if (PyUnicode_READ(kind, s, i) == ch)
377 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
378 }
379 else {
380 for(i = size-1; i >= 0; i--)
381 if (PyUnicode_READ(kind, s, i) == ch)
382 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
383 }
384 return NULL;
385}
386
Victor Stinnerfe226c02011-10-03 03:52:20 +0200387static PyObject*
388resize_compact(PyObject *unicode, Py_ssize_t length)
389{
390 Py_ssize_t char_size;
391 Py_ssize_t struct_size;
392 Py_ssize_t new_size;
393 int share_wstr;
394
395 assert(PyUnicode_IS_READY(unicode));
396 char_size = PyUnicode_CHARACTER_SIZE(unicode);
397 if (PyUnicode_IS_COMPACT_ASCII(unicode))
398 struct_size = sizeof(PyASCIIObject);
399 else
400 struct_size = sizeof(PyCompactUnicodeObject);
401 share_wstr = (_PyUnicode_WSTR(unicode) == PyUnicode_DATA(unicode));
402
403 _Py_DEC_REFTOTAL;
404 _Py_ForgetReference(unicode);
405
406 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
407 PyErr_NoMemory();
408 return NULL;
409 }
410 new_size = (struct_size + (length + 1) * char_size);
411
412 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
413 if (unicode == NULL) {
414 PyObject_Del(unicode);
415 PyErr_NoMemory();
416 return NULL;
417 }
418 _Py_NewReference(unicode);
419 _PyUnicode_LENGTH(unicode) = length;
420 if (share_wstr)
421 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
422 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
423 length, 0);
424 return unicode;
425}
426
Alexander Belopolsky40018472011-02-26 01:02:56 +0000427static int
Victor Stinnerfe226c02011-10-03 03:52:20 +0200428resize_inplace(register PyUnicodeObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000429{
430 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000431
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200432 assert(!PyUnicode_IS_COMPACT(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200433
Victor Stinnerfe226c02011-10-03 03:52:20 +0200434 assert(Py_REFCNT(unicode) == 1);
Victor Stinnerb15d4d82011-09-28 23:59:20 +0200435 _PyUnicode_DIRTY(unicode);
Tim Petersced69f82003-09-16 20:30:58 +0000436
Victor Stinnerfe226c02011-10-03 03:52:20 +0200437 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
438 {
439 PyObject_DEL(_PyUnicode_UTF8(unicode));
440 _PyUnicode_UTF8(unicode) = NULL;
441 }
442
443 if (PyUnicode_IS_READY(unicode)) {
444 Py_ssize_t char_size;
445 Py_ssize_t new_size;
446 int share_wstr;
447 void *data;
448
449 data = _PyUnicode_DATA_ANY(unicode);
450 assert(data != NULL);
451 char_size = PyUnicode_CHARACTER_SIZE(unicode);
452 share_wstr = (_PyUnicode_WSTR(unicode) == data);
453
454 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
455 PyErr_NoMemory();
456 return -1;
457 }
458 new_size = (length + 1) * char_size;
459
460 data = (PyObject *)PyObject_REALLOC(data, new_size);
461 if (data == NULL) {
462 PyErr_NoMemory();
463 return -1;
464 }
465 _PyUnicode_DATA_ANY(unicode) = data;
466 if (share_wstr)
467 _PyUnicode_WSTR(unicode) = data;
468 _PyUnicode_LENGTH(unicode) = length;
469 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
470 if (share_wstr)
471 return 0;
472 }
473 if (_PyUnicode_WSTR(unicode) != NULL) {
474 assert(_PyUnicode_WSTR(unicode) != NULL);
475
476 oldstr = _PyUnicode_WSTR(unicode);
477 _PyUnicode_WSTR(unicode) = PyObject_REALLOC(_PyUnicode_WSTR(unicode),
478 sizeof(Py_UNICODE) * (length + 1));
479 if (!_PyUnicode_WSTR(unicode)) {
480 _PyUnicode_WSTR(unicode) = (Py_UNICODE *)oldstr;
481 PyErr_NoMemory();
482 return -1;
483 }
484 _PyUnicode_WSTR(unicode)[length] = 0;
485 _PyUnicode_WSTR_LENGTH(unicode) = length;
486 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000487 return 0;
488}
489
Victor Stinnerfe226c02011-10-03 03:52:20 +0200490static PyObject*
491resize_copy(PyObject *unicode, Py_ssize_t length)
492{
493 Py_ssize_t copy_length;
494 if (PyUnicode_IS_COMPACT(unicode)) {
495 PyObject *copy;
496 assert(PyUnicode_IS_READY(unicode));
497
498 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
499 if (copy == NULL)
500 return NULL;
501
502 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
503 if (PyUnicode_CopyCharacters(copy, 0,
504 unicode, 0,
505 copy_length) < 0)
506 {
507 Py_DECREF(copy);
508 return NULL;
509 }
510 return copy;
511 } else {
Victor Stinner2fd82272011-10-03 04:06:05 +0200512 PyUnicodeObject *w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200513 assert(_PyUnicode_WSTR(unicode) != NULL);
514 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinner2fd82272011-10-03 04:06:05 +0200515 w = _PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200516 if (w == NULL)
517 return NULL;
518 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
519 copy_length = Py_MIN(copy_length, length);
520 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
521 copy_length);
522 return (PyObject*)w;
523 }
524}
525
/* We allocate one more character to make sure the string is
   Ux0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/

#ifdef Py_DEBUG
/* Debug counter, incremented by _PyUnicode_New() for each allocation. */
int unicode_old_new_calls = 0;
#endif

Alexander Belopolsky40018472011-02-26 01:02:56 +0000539static PyUnicodeObject *
540_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000541{
542 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200543 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000544
Thomas Wouters477c8d52006-05-27 19:21:47 +0000545 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000546 if (length == 0 && unicode_empty != NULL) {
547 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200548 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000549 }
550
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000551 /* Ensure we won't overflow the size. */
552 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
553 return (PyUnicodeObject *)PyErr_NoMemory();
554 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200555 if (length < 0) {
556 PyErr_SetString(PyExc_SystemError,
557 "Negative size passed to _PyUnicode_New");
558 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000559 }
560
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200561#ifdef Py_DEBUG
562 ++unicode_old_new_calls;
563#endif
564
565 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
566 if (unicode == NULL)
567 return NULL;
568 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
569 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
570 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000571 PyErr_NoMemory();
572 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000573 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200574
Jeremy Hyltond8082792003-09-16 19:41:39 +0000575 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000576 * the caller fails before initializing str -- unicode_resize()
577 * reads str[0], and the Keep-Alive optimization can keep memory
578 * allocated for str alive across a call to unicode_dealloc(unicode).
579 * We don't want unicode_resize to read uninitialized memory in
580 * that case.
581 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200582 _PyUnicode_WSTR(unicode)[0] = 0;
583 _PyUnicode_WSTR(unicode)[length] = 0;
584 _PyUnicode_WSTR_LENGTH(unicode) = length;
585 _PyUnicode_HASH(unicode) = -1;
586 _PyUnicode_STATE(unicode).interned = 0;
587 _PyUnicode_STATE(unicode).kind = 0;
588 _PyUnicode_STATE(unicode).compact = 0;
589 _PyUnicode_STATE(unicode).ready = 0;
590 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200591 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200592 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200593 _PyUnicode_UTF8(unicode) = NULL;
594 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000595 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000596
Benjamin Peterson29060642009-01-31 22:14:21 +0000597 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000598 /* XXX UNREF/NEWREF interface should be more symmetrical */
599 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000600 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000601 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000602 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000603}
604
Victor Stinnerf42dc442011-10-02 23:33:16 +0200605static const char*
606unicode_kind_name(PyObject *unicode)
607{
Victor Stinner910337b2011-10-03 03:20:16 +0200608 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerf42dc442011-10-02 23:33:16 +0200609 if (!PyUnicode_IS_COMPACT(unicode))
610 {
611 if (!PyUnicode_IS_READY(unicode))
612 return "wstr";
613 switch(PyUnicode_KIND(unicode))
614 {
615 case PyUnicode_1BYTE_KIND:
616 if (PyUnicode_IS_COMPACT_ASCII(unicode))
617 return "legacy ascii";
618 else
619 return "legacy latin1";
620 case PyUnicode_2BYTE_KIND:
621 return "legacy UCS2";
622 case PyUnicode_4BYTE_KIND:
623 return "legacy UCS4";
624 default:
625 return "<legacy invalid kind>";
626 }
627 }
628 assert(PyUnicode_IS_READY(unicode));
629 switch(PyUnicode_KIND(unicode))
630 {
631 case PyUnicode_1BYTE_KIND:
632 if (PyUnicode_IS_COMPACT_ASCII(unicode))
633 return "ascii";
634 else
635 return "compact latin1";
636 case PyUnicode_2BYTE_KIND:
637 return "compact UCS2";
638 case PyUnicode_4BYTE_KIND:
639 return "compact UCS4";
640 default:
641 return "<invalid compact kind>";
642 }
643}
644
#ifdef Py_DEBUG
/* Debug counter, incremented by PyUnicode_New() for each allocation. */
int unicode_new_new_calls = 0;

/* Functions wrapping macros for use in debugger */

/* Function form of the PyUnicode_UTF8 macro. */
char *_PyUnicode_utf8(void *unicode){
    return PyUnicode_UTF8(unicode);
}

/* Function form of the _PyUnicode_COMPACT_DATA macro. */
void *_PyUnicode_compact_data(void *unicode) {
    return _PyUnicode_COMPACT_DATA(unicode);
}

/* Print the layout flags and candidate data addresses of a string to
   stdout, then return PyUnicode_DATA(unicode). */
void *_PyUnicode_data(void *unicode){
    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
    printf("compact data %p\n", (void *)_PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}

/* Print a one-line summary of a unicode object's fields to stdout: kind
   name, length and wstr pointer; for non-ASCII strings also wstr_length,
   utf8 and utf8_length; for non-compact strings also the data pointer.

   Lengths are Py_ssize_t (signed) and are printed with %zd; pointers are
   cast to void * as %p requires. */
void
_PyUnicode_Dump(PyObject *op)
{
    PyASCIIObject *ascii = (PyASCIIObject *)op;
    printf("%s: len=%zd, wstr=%p",
           unicode_kind_name(op),
           ascii->length,
           (void *)ascii->wstr);
    if (!ascii->state.ascii) {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        printf(" (%zd), utf8=%p (%zd)",
               compact->wstr_length,
               (void *)compact->utf8,
               compact->utf8_length);
    }
    if (!ascii->state.compact) {
        PyUnicodeObject *unicode = (PyUnicodeObject *)op;
        printf(", data=%p",
               (void *)unicode->data.any);
    }
    printf("\n");
}
#endif

690PyObject *
691PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
692{
693 PyObject *obj;
694 PyCompactUnicodeObject *unicode;
695 void *data;
696 int kind_state;
697 int is_sharing = 0, is_ascii = 0;
698 Py_ssize_t char_size;
699 Py_ssize_t struct_size;
700
701 /* Optimization for empty strings */
702 if (size == 0 && unicode_empty != NULL) {
703 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200704 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200705 }
706
707#ifdef Py_DEBUG
708 ++unicode_new_new_calls;
709#endif
710
711 struct_size = sizeof(PyCompactUnicodeObject);
712 if (maxchar < 128) {
713 kind_state = PyUnicode_1BYTE_KIND;
714 char_size = 1;
715 is_ascii = 1;
716 struct_size = sizeof(PyASCIIObject);
717 }
718 else if (maxchar < 256) {
719 kind_state = PyUnicode_1BYTE_KIND;
720 char_size = 1;
721 }
722 else if (maxchar < 65536) {
723 kind_state = PyUnicode_2BYTE_KIND;
724 char_size = 2;
725 if (sizeof(wchar_t) == 2)
726 is_sharing = 1;
727 }
728 else {
729 kind_state = PyUnicode_4BYTE_KIND;
730 char_size = 4;
731 if (sizeof(wchar_t) == 4)
732 is_sharing = 1;
733 }
734
735 /* Ensure we won't overflow the size. */
736 if (size < 0) {
737 PyErr_SetString(PyExc_SystemError,
738 "Negative size passed to PyUnicode_New");
739 return NULL;
740 }
741 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
742 return PyErr_NoMemory();
743
744 /* Duplicated allocation code from _PyObject_New() instead of a call to
745 * PyObject_New() so we are able to allocate space for the object and
746 * it's data buffer.
747 */
748 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
749 if (obj == NULL)
750 return PyErr_NoMemory();
751 obj = PyObject_INIT(obj, &PyUnicode_Type);
752 if (obj == NULL)
753 return NULL;
754
755 unicode = (PyCompactUnicodeObject *)obj;
756 if (is_ascii)
757 data = ((PyASCIIObject*)obj) + 1;
758 else
759 data = unicode + 1;
760 _PyUnicode_LENGTH(unicode) = size;
761 _PyUnicode_HASH(unicode) = -1;
762 _PyUnicode_STATE(unicode).interned = 0;
763 _PyUnicode_STATE(unicode).kind = kind_state;
764 _PyUnicode_STATE(unicode).compact = 1;
765 _PyUnicode_STATE(unicode).ready = 1;
766 _PyUnicode_STATE(unicode).ascii = is_ascii;
767 if (is_ascii) {
768 ((char*)data)[size] = 0;
769 _PyUnicode_WSTR(unicode) = NULL;
770 }
771 else if (kind_state == PyUnicode_1BYTE_KIND) {
772 ((char*)data)[size] = 0;
773 _PyUnicode_WSTR(unicode) = NULL;
774 _PyUnicode_WSTR_LENGTH(unicode) = 0;
775 unicode->utf8_length = 0;
776 unicode->utf8 = NULL;
777 }
778 else {
779 unicode->utf8 = NULL;
780 if (kind_state == PyUnicode_2BYTE_KIND)
781 ((Py_UCS2*)data)[size] = 0;
782 else /* kind_state == PyUnicode_4BYTE_KIND */
783 ((Py_UCS4*)data)[size] = 0;
784 if (is_sharing) {
785 _PyUnicode_WSTR_LENGTH(unicode) = size;
786 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
787 }
788 else {
789 _PyUnicode_WSTR_LENGTH(unicode) = 0;
790 _PyUnicode_WSTR(unicode) = NULL;
791 }
792 }
793 return obj;
794}
795
#if SIZEOF_WCHAR_T == 2
/* Convert the 16-bit wchar_t range [begin, end) to UCS4, writing into the
   4-byte buffer of `unicode`.  A high surrogate immediately followed by a
   low surrogate is combined into a single code point; any other value
   (including a lone surrogate) is copied unchanged.  The remaining
   conversions are implemented as macros for efficiency.

   The caller must have sized `unicode` so that the decoded characters fill
   it exactly (this is asserted), with room for a terminating null
   character after them. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyUnicodeObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        int is_pair;

        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        is_pair = (src[0] >= 0xD800 && src[0] <= 0xDBFF
                   && (src + 1) < end
                   && src[1] >= 0xDC00 && src[1] <= 0xDFFF);
        if (!is_pair) {
            /* Plain BMP character or unpaired surrogate. */
            *dst++ = *src;
            src++;
            continue;
        }
        /* Merge the two 10-bit payloads and add the supplementary offset. */
        *dst++ = (((src[0] & 0x3FF) << 10) | (src[1] & 0x3FF)) + 0x10000;
        src += 2;
    }
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
834
Victor Stinnercd9950f2011-10-02 00:34:53 +0200835static int
836_PyUnicode_Dirty(PyObject *unicode)
837{
Victor Stinner910337b2011-10-03 03:20:16 +0200838 assert(_PyUnicode_CHECK(unicode));
Victor Stinnercd9950f2011-10-02 00:34:53 +0200839 if (Py_REFCNT(unicode) != 1) {
840 PyErr_SetString(PyExc_ValueError,
841 "Cannot modify a string having more than 1 reference");
842 return -1;
843 }
844 _PyUnicode_DIRTY(unicode);
845 return 0;
846}
847
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200848Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200849PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
850 PyObject *from, Py_ssize_t from_start,
851 Py_ssize_t how_many)
852{
Victor Stinnera0702ab2011-09-29 14:14:38 +0200853 unsigned int from_kind, to_kind;
854 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200855
Victor Stinnerb1536152011-09-30 02:26:10 +0200856 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
857 PyErr_BadInternalCall();
858 return -1;
859 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200860
861 if (PyUnicode_READY(from))
862 return -1;
863 if (PyUnicode_READY(to))
864 return -1;
865
Victor Stinnerff9e50f2011-09-28 22:17:19 +0200866 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200867 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
868 PyErr_Format(PyExc_ValueError,
869 "Cannot write %zi characters at %zi "
870 "in a string of %zi characters",
871 how_many, to_start, PyUnicode_GET_LENGTH(to));
872 return -1;
873 }
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200874 if (how_many == 0)
875 return 0;
876
Victor Stinnercd9950f2011-10-02 00:34:53 +0200877 if (_PyUnicode_Dirty(to))
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200878 return -1;
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200879
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200880 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200881 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200882 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200883 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200884
Victor Stinnerf42dc442011-10-02 23:33:16 +0200885 if (from_kind == to_kind
886 /* deny latin1 => ascii */
887 && PyUnicode_MAX_CHAR_VALUE(to) >= PyUnicode_MAX_CHAR_VALUE(from))
888 {
Victor Stinnera0702ab2011-09-29 14:14:38 +0200889 Py_MEMCPY((char*)to_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200890 + PyUnicode_KIND_SIZE(to_kind, to_start),
Victor Stinnera0702ab2011-09-29 14:14:38 +0200891 (char*)from_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200892 + PyUnicode_KIND_SIZE(from_kind, from_start),
893 PyUnicode_KIND_SIZE(to_kind, how_many));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200894 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200895 else if (from_kind == PyUnicode_1BYTE_KIND
896 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200897 {
898 _PyUnicode_CONVERT_BYTES(
899 Py_UCS1, Py_UCS2,
900 PyUnicode_1BYTE_DATA(from) + from_start,
901 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
902 PyUnicode_2BYTE_DATA(to) + to_start
903 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200904 }
Victor Stinner157f83f2011-09-28 21:41:31 +0200905 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200906 && to_kind == PyUnicode_4BYTE_KIND)
907 {
908 _PyUnicode_CONVERT_BYTES(
909 Py_UCS1, Py_UCS4,
910 PyUnicode_1BYTE_DATA(from) + from_start,
911 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
912 PyUnicode_4BYTE_DATA(to) + to_start
913 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200914 }
915 else if (from_kind == PyUnicode_2BYTE_KIND
916 && to_kind == PyUnicode_4BYTE_KIND)
917 {
918 _PyUnicode_CONVERT_BYTES(
919 Py_UCS2, Py_UCS4,
920 PyUnicode_2BYTE_DATA(from) + from_start,
921 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
922 PyUnicode_4BYTE_DATA(to) + to_start
923 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200924 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200925 else {
926 int invalid_kinds;
Victor Stinnerf42dc442011-10-02 23:33:16 +0200927
928 /* check if max_char(from substring) <= max_char(to) */
929 if (from_kind > to_kind
930 /* latin1 => ascii */
931 || (PyUnicode_IS_COMPACT_ASCII(to)
932 && to_kind == PyUnicode_1BYTE_KIND
933 && !PyUnicode_IS_COMPACT_ASCII(from)))
934 {
Victor Stinnera0702ab2011-09-29 14:14:38 +0200935 /* slow path to check for character overflow */
936 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
937 Py_UCS4 ch, maxchar;
938 Py_ssize_t i;
939
940 maxchar = 0;
941 invalid_kinds = 0;
942 for (i=0; i < how_many; i++) {
943 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
944 if (ch > maxchar) {
945 maxchar = ch;
946 if (maxchar > to_maxchar) {
947 invalid_kinds = 1;
948 break;
949 }
950 }
951 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
952 }
953 }
954 else
955 invalid_kinds = 1;
956 if (invalid_kinds) {
957 PyErr_Format(PyExc_ValueError,
Victor Stinnerf42dc442011-10-02 23:33:16 +0200958 "Cannot copy %s characters "
959 "into a string of %s characters",
960 unicode_kind_name(from),
961 unicode_kind_name(to));
Victor Stinnera0702ab2011-09-29 14:14:38 +0200962 return -1;
963 }
964 }
965 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200966}
967
Victor Stinner17222162011-09-28 22:15:37 +0200968/* Find the maximum code point and count the number of surrogate pairs so a
969 correct string length can be computed before converting a string to UCS4.
970 This function counts single surrogates as a character and not as a pair.
971
972 Return 0 on success, or -1 on error. */
973static int
974find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
975 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200976{
977 const wchar_t *iter;
978
Victor Stinnerc53be962011-10-02 21:33:54 +0200979 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200980 if (num_surrogates == NULL || maxchar == NULL) {
981 PyErr_SetString(PyExc_SystemError,
982 "unexpected NULL arguments to "
983 "PyUnicode_FindMaxCharAndNumSurrogatePairs");
984 return -1;
985 }
986
987 *num_surrogates = 0;
988 *maxchar = 0;
989
990 for (iter = begin; iter < end; ) {
991 if (*iter > *maxchar)
992 *maxchar = *iter;
993#if SIZEOF_WCHAR_T == 2
994 if (*iter >= 0xD800 && *iter <= 0xDBFF
995 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
996 {
997 Py_UCS4 surrogate_val;
998 surrogate_val = (((iter[0] & 0x3FF)<<10)
999 | (iter[1] & 0x3FF)) + 0x10000;
1000 ++(*num_surrogates);
1001 if (surrogate_val > *maxchar)
1002 *maxchar = surrogate_val;
1003 iter += 2;
1004 }
1005 else
1006 iter++;
1007#else
1008 iter++;
1009#endif
1010 }
1011 return 0;
1012}
1013
1014#ifdef Py_DEBUG
1015int unicode_ready_calls = 0;
1016#endif
1017
1018int
Victor Stinnerd8f65102011-09-29 19:43:17 +02001019_PyUnicode_Ready(PyObject *obj)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001020{
Victor Stinnerd8f65102011-09-29 19:43:17 +02001021 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001022 wchar_t *end;
1023 Py_UCS4 maxchar = 0;
1024 Py_ssize_t num_surrogates;
1025#if SIZEOF_WCHAR_T == 2
1026 Py_ssize_t length_wo_surrogates;
1027#endif
1028
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001029 /* _PyUnicode_Ready() is only intented for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001030 strings were created using _PyObject_New() and where no canonical
1031 representation (the str field) has been set yet aka strings
1032 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001033 assert(_PyUnicode_CHECK(unicode));
1034 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001035 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001036 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001037 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001038 /* Actually, it should neither be interned nor be anything else: */
1039 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001040
1041#ifdef Py_DEBUG
1042 ++unicode_ready_calls;
1043#endif
1044
1045 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001046 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001047 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001048 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001049
1050 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001051 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1052 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001053 PyErr_NoMemory();
1054 return -1;
1055 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001056 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001057 _PyUnicode_WSTR(unicode), end,
1058 PyUnicode_1BYTE_DATA(unicode));
1059 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1060 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1061 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1062 if (maxchar < 128) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001063 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001064 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001065 }
1066 else {
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001067 _PyUnicode_UTF8(unicode) = NULL;
1068 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001069 }
1070 PyObject_FREE(_PyUnicode_WSTR(unicode));
1071 _PyUnicode_WSTR(unicode) = NULL;
1072 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1073 }
1074 /* In this case we might have to convert down from 4-byte native
1075 wchar_t to 2-byte unicode. */
1076 else if (maxchar < 65536) {
1077 assert(num_surrogates == 0 &&
1078 "FindMaxCharAndNumSurrogatePairs() messed up");
1079
Victor Stinner506f5922011-09-28 22:34:18 +02001080#if SIZEOF_WCHAR_T == 2
1081 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001082 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001083 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1084 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1085 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001086 _PyUnicode_UTF8(unicode) = NULL;
1087 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001088#else
1089 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001090 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001091 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001092 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001093 PyErr_NoMemory();
1094 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001095 }
Victor Stinner506f5922011-09-28 22:34:18 +02001096 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1097 _PyUnicode_WSTR(unicode), end,
1098 PyUnicode_2BYTE_DATA(unicode));
1099 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1100 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1101 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001102 _PyUnicode_UTF8(unicode) = NULL;
1103 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001104 PyObject_FREE(_PyUnicode_WSTR(unicode));
1105 _PyUnicode_WSTR(unicode) = NULL;
1106 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1107#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001108 }
1109 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1110 else {
1111#if SIZEOF_WCHAR_T == 2
1112 /* in case the native representation is 2-bytes, we need to allocate a
1113 new normalized 4-byte version. */
1114 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001115 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1116 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001117 PyErr_NoMemory();
1118 return -1;
1119 }
1120 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1121 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001122 _PyUnicode_UTF8(unicode) = NULL;
1123 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001124 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1125 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001126 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001127 PyObject_FREE(_PyUnicode_WSTR(unicode));
1128 _PyUnicode_WSTR(unicode) = NULL;
1129 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1130#else
1131 assert(num_surrogates == 0);
1132
Victor Stinnerc3c74152011-10-02 20:39:55 +02001133 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001134 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001135 _PyUnicode_UTF8(unicode) = NULL;
1136 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001137 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1138#endif
1139 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1140 }
1141 _PyUnicode_STATE(unicode).ready = 1;
1142 return 0;
1143}
1144
Alexander Belopolsky40018472011-02-26 01:02:56 +00001145static void
1146unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001147{
Walter Dörwald16807132007-05-25 13:52:07 +00001148 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001149 case SSTATE_NOT_INTERNED:
1150 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001151
Benjamin Peterson29060642009-01-31 22:14:21 +00001152 case SSTATE_INTERNED_MORTAL:
1153 /* revive dead object temporarily for DelItem */
1154 Py_REFCNT(unicode) = 3;
1155 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
1156 Py_FatalError(
1157 "deletion of interned string failed");
1158 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001159
Benjamin Peterson29060642009-01-31 22:14:21 +00001160 case SSTATE_INTERNED_IMMORTAL:
1161 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001162
Benjamin Peterson29060642009-01-31 22:14:21 +00001163 default:
1164 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001165 }
1166
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001167 if (_PyUnicode_WSTR(unicode) &&
1168 (!PyUnicode_IS_READY(unicode) ||
1169 _PyUnicode_WSTR(unicode) != PyUnicode_DATA(unicode)))
1170 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001171 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001172 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001173
1174 if (PyUnicode_IS_COMPACT(unicode)) {
1175 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001176 }
1177 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001178 if (_PyUnicode_DATA_ANY(unicode))
1179 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Benjamin Peterson29060642009-01-31 22:14:21 +00001180 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001181 }
1182}
1183
Alexander Belopolsky40018472011-02-26 01:02:56 +00001184static int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001185unicode_resizable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001186{
Victor Stinnera3be6132011-10-03 02:16:37 +02001187 Py_ssize_t len;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001188 if (Py_REFCNT(unicode) != 1)
1189 return 0;
1190 if (PyUnicode_CHECK_INTERNED(unicode))
1191 return 0;
1192 if (unicode == unicode_empty)
1193 return 0;
Victor Stinnera3be6132011-10-03 02:16:37 +02001194 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
1195 len = PyUnicode_WSTR_LENGTH(unicode);
1196 else
1197 len = PyUnicode_GET_LENGTH(unicode);
1198 if (len == 1) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001199 Py_UCS4 ch;
Victor Stinnera3be6132011-10-03 02:16:37 +02001200 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001201 ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnera3be6132011-10-03 02:16:37 +02001202 else
1203 ch = PyUnicode_READ_CHAR(unicode, 0);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001204 if (ch < 256 && unicode_latin1[ch] == unicode)
1205 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001206 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001207 /* FIXME: reenable resize_inplace */
1208 if (!PyUnicode_IS_COMPACT(unicode))
1209 return 0;
1210 return 1;
1211}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001212
Victor Stinnerfe226c02011-10-03 03:52:20 +02001213static int
1214unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1215{
1216 PyObject *unicode;
1217 Py_ssize_t old_length;
1218
1219 assert(p_unicode != NULL);
1220 unicode = *p_unicode;
1221
1222 assert(unicode != NULL);
1223 assert(PyUnicode_Check(unicode));
1224 assert(0 <= length);
1225
Victor Stinner910337b2011-10-03 03:20:16 +02001226 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001227 old_length = PyUnicode_WSTR_LENGTH(unicode);
1228 else
1229 old_length = PyUnicode_GET_LENGTH(unicode);
1230 if (old_length == length)
1231 return 0;
1232
1233 /* FIXME: really create a new object? */
1234 if (!unicode_resizable(unicode)) {
1235 PyObject *copy = resize_copy(unicode, length);
1236 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001237 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001238 Py_DECREF(*p_unicode);
1239 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001240 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001241 }
1242
Victor Stinnerfe226c02011-10-03 03:52:20 +02001243 if (PyUnicode_IS_COMPACT(unicode)) {
1244 *p_unicode = resize_compact(unicode, length);
1245 if (*p_unicode == NULL)
1246 return -1;
1247 return 0;
1248 } else
1249 return resize_inplace((PyUnicodeObject*)unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001250}
1251
Alexander Belopolsky40018472011-02-26 01:02:56 +00001252int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001253PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001254{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001255 PyObject *unicode;
1256 if (p_unicode == NULL) {
1257 PyErr_BadInternalCall();
1258 return -1;
1259 }
1260 unicode = *p_unicode;
1261 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0
1262 || _PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND)
1263 {
1264 PyErr_BadInternalCall();
1265 return -1;
1266 }
1267 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001268}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001269
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001270static PyObject*
1271get_latin1_char(unsigned char ch)
1272{
Victor Stinnera464fc12011-10-02 20:39:30 +02001273 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001274 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001275 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001276 if (!unicode)
1277 return NULL;
1278 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1279 unicode_latin1[ch] = unicode;
1280 }
1281 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001282 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001283}
1284
Alexander Belopolsky40018472011-02-26 01:02:56 +00001285PyObject *
1286PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001287{
1288 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001289 Py_UCS4 maxchar = 0;
1290 Py_ssize_t num_surrogates;
1291
1292 if (u == NULL)
1293 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001294
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001295 /* If the Unicode data is known at construction time, we can apply
1296 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001297
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001298 /* Optimization for empty strings */
1299 if (size == 0 && unicode_empty != NULL) {
1300 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001301 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001302 }
Tim Petersced69f82003-09-16 20:30:58 +00001303
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001304 /* Single character Unicode objects in the Latin-1 range are
1305 shared when using this constructor */
1306 if (size == 1 && *u < 256)
1307 return get_latin1_char((unsigned char)*u);
1308
1309 /* If not empty and not single character, copy the Unicode data
1310 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001311 if (find_maxchar_surrogates(u, u + size,
1312 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001313 return NULL;
1314
1315 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1316 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001317 if (!unicode)
1318 return NULL;
1319
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001320 switch (PyUnicode_KIND(unicode)) {
1321 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001322 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001323 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1324 break;
1325 case PyUnicode_2BYTE_KIND:
1326#if Py_UNICODE_SIZE == 2
1327 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1328#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001329 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001330 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1331#endif
1332 break;
1333 case PyUnicode_4BYTE_KIND:
1334#if SIZEOF_WCHAR_T == 2
1335 /* This is the only case which has to process surrogates, thus
1336 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001337 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001338#else
1339 assert(num_surrogates == 0);
1340 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1341#endif
1342 break;
1343 default:
1344 assert(0 && "Impossible state");
1345 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001346
1347 return (PyObject *)unicode;
1348}
1349
Alexander Belopolsky40018472011-02-26 01:02:56 +00001350PyObject *
1351PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001352{
1353 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001354
Benjamin Peterson14339b62009-01-31 16:36:08 +00001355 if (size < 0) {
1356 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001357 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001358 return NULL;
1359 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001360
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001361 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001362 some optimizations which share commonly used objects.
1363 Also, this means the input must be UTF-8, so fall back to the
1364 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001365 if (u != NULL) {
1366
Benjamin Peterson29060642009-01-31 22:14:21 +00001367 /* Optimization for empty strings */
1368 if (size == 0 && unicode_empty != NULL) {
1369 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001370 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001371 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001372
1373 /* Single characters are shared when using this constructor.
1374 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001375 if (size == 1 && Py_CHARMASK(*u) < 128)
1376 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001377
1378 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001379 }
1380
Walter Dörwald55507312007-05-18 13:12:10 +00001381 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001382 if (!unicode)
1383 return NULL;
1384
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001385 return (PyObject *)unicode;
1386}
1387
Alexander Belopolsky40018472011-02-26 01:02:56 +00001388PyObject *
1389PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001390{
1391 size_t size = strlen(u);
1392 if (size > PY_SSIZE_T_MAX) {
1393 PyErr_SetString(PyExc_OverflowError, "input too long");
1394 return NULL;
1395 }
1396
1397 return PyUnicode_FromStringAndSize(u, size);
1398}
1399
Victor Stinnere57b1c02011-09-28 22:20:48 +02001400static PyObject*
1401_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001402{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001403 PyObject *res;
1404 unsigned char max = 127;
1405 Py_ssize_t i;
1406 for (i = 0; i < size; i++) {
1407 if (u[i] & 0x80) {
1408 max = 255;
1409 break;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001410 }
1411 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001412 res = PyUnicode_New(size, max);
1413 if (!res)
1414 return NULL;
1415 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1416 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001417}
1418
Victor Stinnere57b1c02011-09-28 22:20:48 +02001419static PyObject*
1420_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001421{
1422 PyObject *res;
1423 Py_UCS2 max = 0;
1424 Py_ssize_t i;
1425 for (i = 0; i < size; i++)
1426 if (u[i] > max)
1427 max = u[i];
1428 res = PyUnicode_New(size, max);
1429 if (!res)
1430 return NULL;
1431 if (max >= 256)
1432 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1433 else
1434 for (i = 0; i < size; i++)
1435 PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i];
1436 return res;
1437}
1438
Victor Stinnere57b1c02011-09-28 22:20:48 +02001439static PyObject*
1440_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001441{
1442 PyObject *res;
1443 Py_UCS4 max = 0;
1444 Py_ssize_t i;
1445 for (i = 0; i < size; i++)
1446 if (u[i] > max)
1447 max = u[i];
1448 res = PyUnicode_New(size, max);
1449 if (!res)
1450 return NULL;
1451 if (max >= 0x10000)
1452 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1453 else {
1454 int kind = PyUnicode_KIND(res);
1455 void *data = PyUnicode_DATA(res);
1456 for (i = 0; i < size; i++)
1457 PyUnicode_WRITE(kind, data, i, u[i]);
1458 }
1459 return res;
1460}
1461
1462PyObject*
1463PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1464{
1465 switch(kind) {
1466 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001467 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001468 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001469 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001470 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001471 return _PyUnicode_FromUCS4(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001472 }
Victor Stinner202b62b2011-10-01 23:48:37 +02001473 PyErr_SetString(PyExc_ValueError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001474 return NULL;
1475}
1476
Victor Stinner034f6cf2011-09-30 02:26:44 +02001477PyObject*
1478PyUnicode_Copy(PyObject *unicode)
1479{
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001480 Py_ssize_t size;
1481 PyObject *copy;
1482 void *data;
1483
Victor Stinner034f6cf2011-09-30 02:26:44 +02001484 if (!PyUnicode_Check(unicode)) {
1485 PyErr_BadInternalCall();
1486 return NULL;
1487 }
1488 if (PyUnicode_READY(unicode))
1489 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001490
1491 size = PyUnicode_GET_LENGTH(unicode);
1492 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1493 if (!copy)
1494 return NULL;
1495 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1496
1497 data = PyUnicode_DATA(unicode);
1498 switch (PyUnicode_KIND(unicode))
1499 {
1500 case PyUnicode_1BYTE_KIND:
1501 memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1502 break;
1503 case PyUnicode_2BYTE_KIND:
1504 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1505 break;
1506 case PyUnicode_4BYTE_KIND:
1507 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1508 break;
1509 default:
1510 assert(0);
1511 break;
1512 }
1513 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001514}
1515
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001516
Victor Stinnerbc603d12011-10-02 01:00:40 +02001517/* Widen Unicode objects to larger buffers. Don't write terminating null
1518 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001519
1520void*
1521_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1522{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001523 Py_ssize_t len;
1524 void *result;
1525 unsigned int skind;
1526
1527 if (PyUnicode_READY(s))
1528 return NULL;
1529
1530 len = PyUnicode_GET_LENGTH(s);
1531 skind = PyUnicode_KIND(s);
1532 if (skind >= kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001533 PyErr_SetString(PyExc_RuntimeError, "invalid widening attempt");
1534 return NULL;
1535 }
1536 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001537 case PyUnicode_2BYTE_KIND:
1538 result = PyMem_Malloc(len * sizeof(Py_UCS2));
1539 if (!result)
1540 return PyErr_NoMemory();
1541 assert(skind == PyUnicode_1BYTE_KIND);
1542 _PyUnicode_CONVERT_BYTES(
1543 Py_UCS1, Py_UCS2,
1544 PyUnicode_1BYTE_DATA(s),
1545 PyUnicode_1BYTE_DATA(s) + len,
1546 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001547 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001548 case PyUnicode_4BYTE_KIND:
1549 result = PyMem_Malloc(len * sizeof(Py_UCS4));
1550 if (!result)
1551 return PyErr_NoMemory();
1552 if (skind == PyUnicode_2BYTE_KIND) {
1553 _PyUnicode_CONVERT_BYTES(
1554 Py_UCS2, Py_UCS4,
1555 PyUnicode_2BYTE_DATA(s),
1556 PyUnicode_2BYTE_DATA(s) + len,
1557 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001558 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001559 else {
1560 assert(skind == PyUnicode_1BYTE_KIND);
1561 _PyUnicode_CONVERT_BYTES(
1562 Py_UCS1, Py_UCS4,
1563 PyUnicode_1BYTE_DATA(s),
1564 PyUnicode_1BYTE_DATA(s) + len,
1565 result);
1566 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001567 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001568 default:
1569 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001570 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001571 PyErr_SetString(PyExc_ValueError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001572 return NULL;
1573}
1574
1575static Py_UCS4*
1576as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1577 int copy_null)
1578{
1579 int kind;
1580 void *data;
1581 Py_ssize_t len, targetlen;
1582 if (PyUnicode_READY(string) == -1)
1583 return NULL;
1584 kind = PyUnicode_KIND(string);
1585 data = PyUnicode_DATA(string);
1586 len = PyUnicode_GET_LENGTH(string);
1587 targetlen = len;
1588 if (copy_null)
1589 targetlen++;
1590 if (!target) {
1591 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1592 PyErr_NoMemory();
1593 return NULL;
1594 }
1595 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1596 if (!target) {
1597 PyErr_NoMemory();
1598 return NULL;
1599 }
1600 }
1601 else {
1602 if (targetsize < targetlen) {
1603 PyErr_Format(PyExc_SystemError,
1604 "string is longer than the buffer");
1605 if (copy_null && 0 < targetsize)
1606 target[0] = 0;
1607 return NULL;
1608 }
1609 }
1610 if (kind != PyUnicode_4BYTE_KIND) {
1611 Py_ssize_t i;
1612 for (i = 0; i < len; i++)
1613 target[i] = PyUnicode_READ(kind, data, i);
1614 }
1615 else
1616 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1617 if (copy_null)
1618 target[len] = 0;
1619 return target;
1620}
1621
1622Py_UCS4*
1623PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1624 int copy_null)
1625{
1626 if (target == NULL || targetsize < 1) {
1627 PyErr_BadInternalCall();
1628 return NULL;
1629 }
1630 return as_ucs4(string, target, targetsize, copy_null);
1631}
1632
1633Py_UCS4*
1634PyUnicode_AsUCS4Copy(PyObject *string)
1635{
1636 return as_ucs4(string, NULL, 0, 1);
1637}
1638
1639#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00001640
Alexander Belopolsky40018472011-02-26 01:02:56 +00001641PyObject *
1642PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001643{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001644 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +00001645 if (size == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001646 return PyUnicode_New(0, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00001647 PyErr_BadInternalCall();
1648 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001649 }
1650
Martin v. Löwis790465f2008-04-05 20:41:37 +00001651 if (size == -1) {
1652 size = wcslen(w);
1653 }
1654
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001655 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001656}
1657
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001658#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00001659
Walter Dörwald346737f2007-05-31 10:44:43 +00001660static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001661makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
1662 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00001663{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001664 *fmt++ = '%';
1665 if (width) {
1666 if (zeropad)
1667 *fmt++ = '0';
1668 fmt += sprintf(fmt, "%d", width);
1669 }
1670 if (precision)
1671 fmt += sprintf(fmt, ".%d", precision);
1672 if (longflag)
1673 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001674 else if (longlongflag) {
1675 /* longlongflag should only ever be nonzero on machines with
1676 HAVE_LONG_LONG defined */
1677#ifdef HAVE_LONG_LONG
1678 char *f = PY_FORMAT_LONG_LONG;
1679 while (*f)
1680 *fmt++ = *f++;
1681#else
1682 /* we shouldn't ever get here */
1683 assert(0);
1684 *fmt++ = 'l';
1685#endif
1686 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001687 else if (size_tflag) {
1688 char *f = PY_FORMAT_SIZE_T;
1689 while (*f)
1690 *fmt++ = *f++;
1691 }
1692 *fmt++ = c;
1693 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00001694}
1695
/* Helper for PyUnicode_FromFormatV(): parse what follows a '%'.

   `f` points to the '%'.  The optional "width.precision" part and the
   optional length modifier ("l", "ll" when HAVE_LONG_LONG is defined, or
   "z", each only when followed by 'd', 'u' or 'i') are consumed.  The values
   found are stored through the output pointers that are not NULL; width and
   precision are 0 when absent.  Return a pointer to the character that the
   caller must interpret as the conversion code. */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* skip the '%' and read the width, e.g. "%2.5s" => width=2 */
    for (f++; Py_ISDIGIT((unsigned)*f); f++)
        width = (width*10) + *f - '0';

    /* ... and the precision, e.g. "%2.5s" => precision=5 */
    if (*f == '.') {
        for (f++; Py_ISDIGIT((unsigned)*f); f++)
            precision = (precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }

    /* Length modifiers: %ld, %lu, %li, %lld, %llu, %lli, %zd, %zu, %zi.
       A modifier not followed by one of these conversions is left alone. */
    switch (*f) {
    case 'l':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            ++f;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
        break;
    case 'z':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            size_tflag = 1;
            ++f;
        }
        break;
    default:
        break;
    }

    /* report only what the caller asked for */
    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;
    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
1760
/* Upper bound on the number of characters produced by %ld: 21 characters
   are enough for a 64-bit integer written in decimal, optional sign
   included. */
#define MAX_LONG_CHARS 21
/* Upper bound on the number of characters produced by %lld: at most
   ceil(log10(256) * SIZEOF_LONG_LONG) digits plus 1 for the sign, where
   53/22 is used as an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG * 53 - 1) / 22)
1768
Walter Dörwaldd2034312007-05-18 16:29:38 +00001769PyObject *
1770PyUnicode_FromFormatV(const char *format, va_list vargs)
1771{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001772 va_list count;
1773 Py_ssize_t callcount = 0;
1774 PyObject **callresults = NULL;
1775 PyObject **callresult = NULL;
1776 Py_ssize_t n = 0;
1777 int width = 0;
1778 int precision = 0;
1779 int zeropad;
1780 const char* f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001781 PyUnicodeObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001782 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001783 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001784 Py_UCS4 maxchar = 127; /* result is ASCII by default */
1785 Py_UCS4 argmaxchar;
1786 Py_ssize_t numbersize = 0;
1787 char *numberresults = NULL;
1788 char *numberresult = NULL;
1789 Py_ssize_t i;
1790 int kind;
1791 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001792
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001793 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001794 /* step 1: count the number of %S/%R/%A/%s format specifications
1795 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
1796 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001797 * result in an array)
1798 * also esimate a upper bound for all the number formats in the string,
1799 * numbers will be formated in step 3 and be keept in a '\0'-separated
1800 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00001801 for (f = format; *f; f++) {
1802 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001803 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001804 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
1805 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
1806 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
1807 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001808
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001809 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001810#ifdef HAVE_LONG_LONG
1811 if (longlongflag) {
1812 if (width < MAX_LONG_LONG_CHARS)
1813 width = MAX_LONG_LONG_CHARS;
1814 }
1815 else
1816#endif
1817 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
1818 including sign. Decimal takes the most space. This
1819 isn't enough for octal. If a width is specified we
1820 need more (which we allocate later). */
1821 if (width < MAX_LONG_CHARS)
1822 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001823
1824 /* account for the size + '\0' to separate numbers
1825 inside of the numberresults buffer */
1826 numbersize += (width + 1);
1827 }
1828 }
1829 else if ((unsigned char)*f > 127) {
1830 PyErr_Format(PyExc_ValueError,
1831 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
1832 "string, got a non-ASCII byte: 0x%02x",
1833 (unsigned char)*f);
1834 return NULL;
1835 }
1836 }
1837 /* step 2: allocate memory for the results of
1838 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
1839 if (callcount) {
1840 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
1841 if (!callresults) {
1842 PyErr_NoMemory();
1843 return NULL;
1844 }
1845 callresult = callresults;
1846 }
1847 /* step 2.5: allocate memory for the results of formating numbers */
1848 if (numbersize) {
1849 numberresults = PyObject_Malloc(numbersize);
1850 if (!numberresults) {
1851 PyErr_NoMemory();
1852 goto fail;
1853 }
1854 numberresult = numberresults;
1855 }
1856
1857 /* step 3: format numbers and figure out how large a buffer we need */
1858 for (f = format; *f; f++) {
1859 if (*f == '%') {
1860 const char* p;
1861 int longflag;
1862 int longlongflag;
1863 int size_tflag;
1864 int numprinted;
1865
1866 p = f;
1867 zeropad = (f[1] == '0');
1868 f = parse_format_flags(f, &width, &precision,
1869 &longflag, &longlongflag, &size_tflag);
1870 switch (*f) {
1871 case 'c':
1872 {
1873 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001874 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001875 n++;
1876 break;
1877 }
1878 case '%':
1879 n++;
1880 break;
1881 case 'i':
1882 case 'd':
1883 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1884 width, precision, *f);
1885 if (longflag)
1886 numprinted = sprintf(numberresult, fmt,
1887 va_arg(count, long));
1888#ifdef HAVE_LONG_LONG
1889 else if (longlongflag)
1890 numprinted = sprintf(numberresult, fmt,
1891 va_arg(count, PY_LONG_LONG));
1892#endif
1893 else if (size_tflag)
1894 numprinted = sprintf(numberresult, fmt,
1895 va_arg(count, Py_ssize_t));
1896 else
1897 numprinted = sprintf(numberresult, fmt,
1898 va_arg(count, int));
1899 n += numprinted;
1900 /* advance by +1 to skip over the '\0' */
1901 numberresult += (numprinted + 1);
1902 assert(*(numberresult - 1) == '\0');
1903 assert(*(numberresult - 2) != '\0');
1904 assert(numprinted >= 0);
1905 assert(numberresult <= numberresults + numbersize);
1906 break;
1907 case 'u':
1908 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1909 width, precision, 'u');
1910 if (longflag)
1911 numprinted = sprintf(numberresult, fmt,
1912 va_arg(count, unsigned long));
1913#ifdef HAVE_LONG_LONG
1914 else if (longlongflag)
1915 numprinted = sprintf(numberresult, fmt,
1916 va_arg(count, unsigned PY_LONG_LONG));
1917#endif
1918 else if (size_tflag)
1919 numprinted = sprintf(numberresult, fmt,
1920 va_arg(count, size_t));
1921 else
1922 numprinted = sprintf(numberresult, fmt,
1923 va_arg(count, unsigned int));
1924 n += numprinted;
1925 numberresult += (numprinted + 1);
1926 assert(*(numberresult - 1) == '\0');
1927 assert(*(numberresult - 2) != '\0');
1928 assert(numprinted >= 0);
1929 assert(numberresult <= numberresults + numbersize);
1930 break;
1931 case 'x':
1932 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
1933 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
1934 n += numprinted;
1935 numberresult += (numprinted + 1);
1936 assert(*(numberresult - 1) == '\0');
1937 assert(*(numberresult - 2) != '\0');
1938 assert(numprinted >= 0);
1939 assert(numberresult <= numberresults + numbersize);
1940 break;
1941 case 'p':
1942 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
1943 /* %p is ill-defined: ensure leading 0x. */
1944 if (numberresult[1] == 'X')
1945 numberresult[1] = 'x';
1946 else if (numberresult[1] != 'x') {
1947 memmove(numberresult + 2, numberresult,
1948 strlen(numberresult) + 1);
1949 numberresult[0] = '0';
1950 numberresult[1] = 'x';
1951 numprinted += 2;
1952 }
1953 n += numprinted;
1954 numberresult += (numprinted + 1);
1955 assert(*(numberresult - 1) == '\0');
1956 assert(*(numberresult - 2) != '\0');
1957 assert(numprinted >= 0);
1958 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001959 break;
1960 case 's':
1961 {
1962 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00001963 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001964 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
1965 if (!str)
1966 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001967 /* since PyUnicode_DecodeUTF8 returns already flexible
1968 unicode objects, there is no need to call ready on them */
1969 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001970 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001971 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001972 /* Remember the str and switch to the next slot */
1973 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001974 break;
1975 }
1976 case 'U':
1977 {
1978 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02001979 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001980 if (PyUnicode_READY(obj) == -1)
1981 goto fail;
1982 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001983 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001984 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001985 break;
1986 }
1987 case 'V':
1988 {
1989 PyObject *obj = va_arg(count, PyObject *);
1990 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001991 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001992 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02001993 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00001994 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001995 if (PyUnicode_READY(obj) == -1)
1996 goto fail;
1997 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001998 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001999 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002000 *callresult++ = NULL;
2001 }
2002 else {
2003 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2004 if (!str_obj)
2005 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002006 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002007 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002008 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002009 *callresult++ = str_obj;
2010 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002011 break;
2012 }
2013 case 'S':
2014 {
2015 PyObject *obj = va_arg(count, PyObject *);
2016 PyObject *str;
2017 assert(obj);
2018 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002019 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002020 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002021 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002022 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002023 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002024 /* Remember the str and switch to the next slot */
2025 *callresult++ = str;
2026 break;
2027 }
2028 case 'R':
2029 {
2030 PyObject *obj = va_arg(count, PyObject *);
2031 PyObject *repr;
2032 assert(obj);
2033 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002034 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002035 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002036 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002037 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002038 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002039 /* Remember the repr and switch to the next slot */
2040 *callresult++ = repr;
2041 break;
2042 }
2043 case 'A':
2044 {
2045 PyObject *obj = va_arg(count, PyObject *);
2046 PyObject *ascii;
2047 assert(obj);
2048 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002049 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002050 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002051 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002052 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002053 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002054 /* Remember the repr and switch to the next slot */
2055 *callresult++ = ascii;
2056 break;
2057 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002058 default:
2059 /* if we stumble upon an unknown
2060 formatting code, copy the rest of
2061 the format string to the output
2062 string. (we cannot just skip the
2063 code, since there's no way to know
2064 what's in the argument list) */
2065 n += strlen(p);
2066 goto expand;
2067 }
2068 } else
2069 n++;
2070 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002071 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002072 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002073 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002074 we don't have to resize the string.
2075 There can be no errors beyond this point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002076 string = (PyUnicodeObject *)PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002077 if (!string)
2078 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002079 kind = PyUnicode_KIND(string);
2080 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002081 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002082 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002083
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002084 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002085 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002086 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002087
2088 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002089 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2090 /* checking for == because the last argument could be a empty
2091 string, which causes i to point to end, the assert at the end of
2092 the loop */
2093 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002094
Benjamin Peterson14339b62009-01-31 16:36:08 +00002095 switch (*f) {
2096 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002097 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002098 const int ordinal = va_arg(vargs, int);
2099 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002100 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002101 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002102 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002103 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002104 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002105 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002106 case 'p':
2107 /* unused, since we already have the result */
2108 if (*f == 'p')
2109 (void) va_arg(vargs, void *);
2110 else
2111 (void) va_arg(vargs, int);
2112 /* extract the result from numberresults and append. */
2113 for (; *numberresult; ++i, ++numberresult)
2114 PyUnicode_WRITE(kind, data, i, *numberresult);
2115 /* skip over the separating '\0' */
2116 assert(*numberresult == '\0');
2117 numberresult++;
2118 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002119 break;
2120 case 's':
2121 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002122 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002123 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002124 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002125 size = PyUnicode_GET_LENGTH(*callresult);
2126 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002127 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2128 *callresult, 0,
2129 size) < 0)
2130 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002131 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002132 /* We're done with the unicode()/repr() => forget it */
2133 Py_DECREF(*callresult);
2134 /* switch to next unicode()/repr() result */
2135 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002136 break;
2137 }
2138 case 'U':
2139 {
2140 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002141 Py_ssize_t size;
2142 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2143 size = PyUnicode_GET_LENGTH(obj);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002144 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2145 obj, 0,
2146 size) < 0)
2147 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002148 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002149 break;
2150 }
2151 case 'V':
2152 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002153 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002154 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002155 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002156 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002157 size = PyUnicode_GET_LENGTH(obj);
2158 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002159 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2160 obj, 0,
2161 size) < 0)
2162 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002163 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002164 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002165 size = PyUnicode_GET_LENGTH(*callresult);
2166 assert(PyUnicode_KIND(*callresult) <=
2167 PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002168 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2169 *callresult,
2170 0, size) < 0)
2171 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002172 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002173 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002174 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002175 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002176 break;
2177 }
2178 case 'S':
2179 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002180 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002181 {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002182 /* unused, since we already have the result */
2183 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002184 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002185 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2186 *callresult, 0,
2187 PyUnicode_GET_LENGTH(*callresult)) < 0)
2188 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002189 i += PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002190 /* We're done with the unicode()/repr() => forget it */
2191 Py_DECREF(*callresult);
2192 /* switch to next unicode()/repr() result */
2193 ++callresult;
2194 break;
2195 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002196 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002197 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002198 break;
2199 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002200 for (; *p; ++p, ++i)
2201 PyUnicode_WRITE(kind, data, i, *p);
2202 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002203 goto end;
2204 }
Victor Stinner1205f272010-09-11 00:54:47 +00002205 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002206 else {
2207 assert(i < PyUnicode_GET_LENGTH(string));
2208 PyUnicode_WRITE(kind, data, i++, *f);
2209 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002210 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002211 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002212
Benjamin Peterson29060642009-01-31 22:14:21 +00002213 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002214 if (callresults)
2215 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002216 if (numberresults)
2217 PyObject_Free(numberresults);
2218 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00002219 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002220 if (callresults) {
2221 PyObject **callresult2 = callresults;
2222 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002223 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002224 ++callresult2;
2225 }
2226 PyObject_Free(callresults);
2227 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002228 if (numberresults)
2229 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002230 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002231}
2232
Walter Dörwaldd2034312007-05-18 16:29:38 +00002233PyObject *
2234PyUnicode_FromFormat(const char *format, ...)
2235{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002236 PyObject* ret;
2237 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002238
2239#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002240 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002241#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002242 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002243#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002244 ret = PyUnicode_FromFormatV(format, vargs);
2245 va_end(vargs);
2246 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002247}
2248
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002249#ifdef HAVE_WCHAR_H
2250
Victor Stinner5593d8a2010-10-02 11:11:27 +00002251/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2252 convert a Unicode object to a wide character string.
2253
Victor Stinnerd88d9832011-09-06 02:00:05 +02002254 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002255 character) required to convert the unicode object. Ignore size argument.
2256
Victor Stinnerd88d9832011-09-06 02:00:05 +02002257 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002258 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002259 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002260static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00002261unicode_aswidechar(PyUnicodeObject *unicode,
2262 wchar_t *w,
2263 Py_ssize_t size)
2264{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002265 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002266 const wchar_t *wstr;
2267
2268 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
2269 if (wstr == NULL)
2270 return -1;
2271
Victor Stinner5593d8a2010-10-02 11:11:27 +00002272 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002273 if (size > res)
2274 size = res + 1;
2275 else
2276 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002277 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002278 return res;
2279 }
2280 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002281 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002282}
2283
2284Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002285PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002286 wchar_t *w,
2287 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002288{
2289 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002290 PyErr_BadInternalCall();
2291 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002292 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002293 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002294}
2295
Victor Stinner137c34c2010-09-29 10:25:54 +00002296wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002297PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002298 Py_ssize_t *size)
2299{
2300 wchar_t* buffer;
2301 Py_ssize_t buflen;
2302
2303 if (unicode == NULL) {
2304 PyErr_BadInternalCall();
2305 return NULL;
2306 }
2307
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002308 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002309 if (buflen == -1)
2310 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002311 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002312 PyErr_NoMemory();
2313 return NULL;
2314 }
2315
Victor Stinner137c34c2010-09-29 10:25:54 +00002316 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2317 if (buffer == NULL) {
2318 PyErr_NoMemory();
2319 return NULL;
2320 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002321 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002322 if (buflen == -1)
2323 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002324 if (size != NULL)
2325 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002326 return buffer;
2327}
2328
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002329#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002330
Alexander Belopolsky40018472011-02-26 01:02:56 +00002331PyObject *
2332PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002333{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002334 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002335 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002336 PyErr_SetString(PyExc_ValueError,
2337 "chr() arg not in range(0x110000)");
2338 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002339 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002340
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002341 if (ordinal < 256)
2342 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002343
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002344 v = PyUnicode_New(1, ordinal);
2345 if (v == NULL)
2346 return NULL;
2347 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
2348 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002349}
2350
Alexander Belopolsky40018472011-02-26 01:02:56 +00002351PyObject *
2352PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002353{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002354 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002355 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002356 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002357 if (PyUnicode_READY(obj))
2358 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002359 Py_INCREF(obj);
2360 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002361 }
2362 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002363 /* For a Unicode subtype that's not a Unicode object,
2364 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002365 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002366 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002367 PyErr_Format(PyExc_TypeError,
2368 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002369 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002370 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002371}
2372
Alexander Belopolsky40018472011-02-26 01:02:56 +00002373PyObject *
2374PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002375 const char *encoding,
2376 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002377{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002378 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002379 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002380
Guido van Rossumd57fd912000-03-10 22:53:23 +00002381 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002382 PyErr_BadInternalCall();
2383 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002384 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002385
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002386 /* Decoding bytes objects is the most common case and should be fast */
2387 if (PyBytes_Check(obj)) {
2388 if (PyBytes_GET_SIZE(obj) == 0) {
2389 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002390 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002391 }
2392 else {
2393 v = PyUnicode_Decode(
2394 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2395 encoding, errors);
2396 }
2397 return v;
2398 }
2399
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002400 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002401 PyErr_SetString(PyExc_TypeError,
2402 "decoding str is not supported");
2403 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002404 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002405
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002406 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2407 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2408 PyErr_Format(PyExc_TypeError,
2409 "coercing to str: need bytes, bytearray "
2410 "or buffer-like object, %.80s found",
2411 Py_TYPE(obj)->tp_name);
2412 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002413 }
Tim Petersced69f82003-09-16 20:30:58 +00002414
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002415 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002416 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002417 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002418 }
Tim Petersced69f82003-09-16 20:30:58 +00002419 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002420 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002421
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002422 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002423 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002424}
2425
/* Write a normalized copy of encoding into lower: upper case letters are
   lowered and every '_' becomes '-', so that e.g. UTF_8 matches "utf-8".

   lower_len is the capacity of lower, including room for the terminating
   null byte.  Returns 1 on success, 0 when encoding is longer than
   lower_len-1 characters (lower is then left unterminated). */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src = encoding;
    char *dst = lower;
    /* Last writable slot, reserved for the terminating null byte. */
    char *const limit = lower + (lower_len - 1);

    for (; *src != '\0'; src++, dst++) {
        if (dst == limit)
            return 0;

        if (Py_ISUPPER(*src))
            *dst = Py_TOLOWER(*src);
        else if (*src == '_')
            *dst = '-';
        else
            *dst = *src;
    }
    *dst = '\0';
    return 1;
}
2458
Alexander Belopolsky40018472011-02-26 01:02:56 +00002459PyObject *
2460PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002461 Py_ssize_t size,
2462 const char *encoding,
2463 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002464{
2465 PyObject *buffer = NULL, *unicode;
2466 Py_buffer info;
2467 char lower[11]; /* Enough for any encoding shortcut */
2468
2469 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002470 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00002471
2472 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002473 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002474 if ((strcmp(lower, "utf-8") == 0) ||
2475 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002476 return PyUnicode_DecodeUTF8(s, size, errors);
2477 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002478 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002479 (strcmp(lower, "iso-8859-1") == 0))
2480 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002481#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002482 else if (strcmp(lower, "mbcs") == 0)
2483 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002484#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002485 else if (strcmp(lower, "ascii") == 0)
2486 return PyUnicode_DecodeASCII(s, size, errors);
2487 else if (strcmp(lower, "utf-16") == 0)
2488 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2489 else if (strcmp(lower, "utf-32") == 0)
2490 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2491 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002492
2493 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002494 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002495 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002496 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002497 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002498 if (buffer == NULL)
2499 goto onError;
2500 unicode = PyCodec_Decode(buffer, encoding, errors);
2501 if (unicode == NULL)
2502 goto onError;
2503 if (!PyUnicode_Check(unicode)) {
2504 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002505 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002506 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002507 Py_DECREF(unicode);
2508 goto onError;
2509 }
2510 Py_DECREF(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002511 if (PyUnicode_READY(unicode)) {
2512 Py_DECREF(unicode);
2513 return NULL;
2514 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002515 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002516
Benjamin Peterson29060642009-01-31 22:14:21 +00002517 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002518 Py_XDECREF(buffer);
2519 return NULL;
2520}
2521
Alexander Belopolsky40018472011-02-26 01:02:56 +00002522PyObject *
2523PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002524 const char *encoding,
2525 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002526{
2527 PyObject *v;
2528
2529 if (!PyUnicode_Check(unicode)) {
2530 PyErr_BadArgument();
2531 goto onError;
2532 }
2533
2534 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002535 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002536
2537 /* Decode via the codec registry */
2538 v = PyCodec_Decode(unicode, encoding, errors);
2539 if (v == NULL)
2540 goto onError;
2541 return v;
2542
Benjamin Peterson29060642009-01-31 22:14:21 +00002543 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002544 return NULL;
2545}
2546
Alexander Belopolsky40018472011-02-26 01:02:56 +00002547PyObject *
2548PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002549 const char *encoding,
2550 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002551{
2552 PyObject *v;
2553
2554 if (!PyUnicode_Check(unicode)) {
2555 PyErr_BadArgument();
2556 goto onError;
2557 }
2558
2559 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002560 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002561
2562 /* Decode via the codec registry */
2563 v = PyCodec_Decode(unicode, encoding, errors);
2564 if (v == NULL)
2565 goto onError;
2566 if (!PyUnicode_Check(v)) {
2567 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002568 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002569 Py_TYPE(v)->tp_name);
2570 Py_DECREF(v);
2571 goto onError;
2572 }
2573 return v;
2574
Benjamin Peterson29060642009-01-31 22:14:21 +00002575 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002576 return NULL;
2577}
2578
Alexander Belopolsky40018472011-02-26 01:02:56 +00002579PyObject *
2580PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002581 Py_ssize_t size,
2582 const char *encoding,
2583 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002584{
2585 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002586
Guido van Rossumd57fd912000-03-10 22:53:23 +00002587 unicode = PyUnicode_FromUnicode(s, size);
2588 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002589 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002590 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2591 Py_DECREF(unicode);
2592 return v;
2593}
2594
Alexander Belopolsky40018472011-02-26 01:02:56 +00002595PyObject *
2596PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002597 const char *encoding,
2598 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002599{
2600 PyObject *v;
2601
2602 if (!PyUnicode_Check(unicode)) {
2603 PyErr_BadArgument();
2604 goto onError;
2605 }
2606
2607 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002608 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002609
2610 /* Encode via the codec registry */
2611 v = PyCodec_Encode(unicode, encoding, errors);
2612 if (v == NULL)
2613 goto onError;
2614 return v;
2615
Benjamin Peterson29060642009-01-31 22:14:21 +00002616 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002617 return NULL;
2618}
2619
Victor Stinnerad158722010-10-27 00:25:46 +00002620PyObject *
2621PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00002622{
Victor Stinner99b95382011-07-04 14:23:54 +02002623#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002624 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2625 PyUnicode_GET_SIZE(unicode),
2626 NULL);
2627#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002628 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00002629#else
Victor Stinner793b5312011-04-27 00:24:21 +02002630 PyInterpreterState *interp = PyThreadState_GET()->interp;
2631 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2632 cannot use it to encode and decode filenames before it is loaded. Load
2633 the Python codec requires to encode at least its own filename. Use the C
2634 version of the locale codec until the codec registry is initialized and
2635 the Python codec is loaded.
2636
2637 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2638 cannot only rely on it: check also interp->fscodec_initialized for
2639 subinterpreters. */
2640 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00002641 return PyUnicode_AsEncodedString(unicode,
2642 Py_FileSystemDefaultEncoding,
2643 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00002644 }
2645 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002646 /* locale encoding with surrogateescape */
2647 wchar_t *wchar;
2648 char *bytes;
2649 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00002650 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002651
2652 wchar = PyUnicode_AsWideCharString(unicode, NULL);
2653 if (wchar == NULL)
2654 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002655 bytes = _Py_wchar2char(wchar, &error_pos);
2656 if (bytes == NULL) {
2657 if (error_pos != (size_t)-1) {
2658 char *errmsg = strerror(errno);
2659 PyObject *exc = NULL;
2660 if (errmsg == NULL)
2661 errmsg = "Py_wchar2char() failed";
2662 raise_encode_exception(&exc,
2663 "filesystemencoding",
2664 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
2665 error_pos, error_pos+1,
2666 errmsg);
2667 Py_XDECREF(exc);
2668 }
2669 else
2670 PyErr_NoMemory();
2671 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002672 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002673 }
2674 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002675
2676 bytes_obj = PyBytes_FromString(bytes);
2677 PyMem_Free(bytes);
2678 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00002679 }
Victor Stinnerad158722010-10-27 00:25:46 +00002680#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00002681}
2682
Alexander Belopolsky40018472011-02-26 01:02:56 +00002683PyObject *
2684PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002685 const char *encoding,
2686 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002687{
2688 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00002689 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00002690
Guido van Rossumd57fd912000-03-10 22:53:23 +00002691 if (!PyUnicode_Check(unicode)) {
2692 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002693 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002694 }
Fred Drakee4315f52000-05-09 19:53:39 +00002695
Victor Stinner2f283c22011-03-02 01:21:46 +00002696 if (encoding == NULL) {
2697 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002698 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002699 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002700 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner2f283c22011-03-02 01:21:46 +00002701 }
Fred Drakee4315f52000-05-09 19:53:39 +00002702
2703 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002704 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002705 if ((strcmp(lower, "utf-8") == 0) ||
2706 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00002707 {
Victor Stinner2f283c22011-03-02 01:21:46 +00002708 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002709 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002710 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002711 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00002712 }
Victor Stinner37296e82010-06-10 13:36:23 +00002713 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002714 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002715 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002716 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002717#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002718 else if (strcmp(lower, "mbcs") == 0)
2719 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2720 PyUnicode_GET_SIZE(unicode),
2721 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002722#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002723 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002724 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00002725 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002726
2727 /* Encode via the codec registry */
2728 v = PyCodec_Encode(unicode, encoding, errors);
2729 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002730 return NULL;
2731
2732 /* The normal path */
2733 if (PyBytes_Check(v))
2734 return v;
2735
2736 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002737 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002738 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002739 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002740
2741 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
2742 "encoder %s returned bytearray instead of bytes",
2743 encoding);
2744 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002745 Py_DECREF(v);
2746 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002747 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002748
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002749 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
2750 Py_DECREF(v);
2751 return b;
2752 }
2753
2754 PyErr_Format(PyExc_TypeError,
2755 "encoder did not return a bytes object (type=%.400s)",
2756 Py_TYPE(v)->tp_name);
2757 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002758 return NULL;
2759}
2760
Alexander Belopolsky40018472011-02-26 01:02:56 +00002761PyObject *
2762PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002763 const char *encoding,
2764 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002765{
2766 PyObject *v;
2767
2768 if (!PyUnicode_Check(unicode)) {
2769 PyErr_BadArgument();
2770 goto onError;
2771 }
2772
2773 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002774 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002775
2776 /* Encode via the codec registry */
2777 v = PyCodec_Encode(unicode, encoding, errors);
2778 if (v == NULL)
2779 goto onError;
2780 if (!PyUnicode_Check(v)) {
2781 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002782 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002783 Py_TYPE(v)->tp_name);
2784 Py_DECREF(v);
2785 goto onError;
2786 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002787 return v;
Tim Petersced69f82003-09-16 20:30:58 +00002788
Benjamin Peterson29060642009-01-31 22:14:21 +00002789 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002790 return NULL;
2791}
2792
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002793PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00002794PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002795 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00002796 return PyUnicode_DecodeFSDefaultAndSize(s, size);
2797}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002798
Christian Heimes5894ba72007-11-04 11:43:14 +00002799PyObject*
2800PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
2801{
Victor Stinner99b95382011-07-04 14:23:54 +02002802#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002803 return PyUnicode_DecodeMBCS(s, size, NULL);
2804#elif defined(__APPLE__)
2805 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
2806#else
Victor Stinner793b5312011-04-27 00:24:21 +02002807 PyInterpreterState *interp = PyThreadState_GET()->interp;
2808 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2809 cannot use it to encode and decode filenames before it is loaded. Load
2810 the Python codec requires to encode at least its own filename. Use the C
2811 version of the locale codec until the codec registry is initialized and
2812 the Python codec is loaded.
2813
2814 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2815 cannot only rely on it: check also interp->fscodec_initialized for
2816 subinterpreters. */
2817 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002818 return PyUnicode_Decode(s, size,
2819 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00002820 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002821 }
2822 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002823 /* locale encoding with surrogateescape */
2824 wchar_t *wchar;
2825 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00002826 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002827
2828 if (s[size] != '\0' || size != strlen(s)) {
2829 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2830 return NULL;
2831 }
2832
Victor Stinner168e1172010-10-16 23:16:16 +00002833 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002834 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00002835 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002836
Victor Stinner168e1172010-10-16 23:16:16 +00002837 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002838 PyMem_Free(wchar);
2839 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002840 }
Victor Stinnerad158722010-10-27 00:25:46 +00002841#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002842}
2843
Martin v. Löwis011e8422009-05-05 04:43:17 +00002844
2845int
2846PyUnicode_FSConverter(PyObject* arg, void* addr)
2847{
2848 PyObject *output = NULL;
2849 Py_ssize_t size;
2850 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002851 if (arg == NULL) {
2852 Py_DECREF(*(PyObject**)addr);
2853 return 1;
2854 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00002855 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00002856 output = arg;
2857 Py_INCREF(output);
2858 }
2859 else {
2860 arg = PyUnicode_FromObject(arg);
2861 if (!arg)
2862 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00002863 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002864 Py_DECREF(arg);
2865 if (!output)
2866 return 0;
2867 if (!PyBytes_Check(output)) {
2868 Py_DECREF(output);
2869 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
2870 return 0;
2871 }
2872 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00002873 size = PyBytes_GET_SIZE(output);
2874 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002875 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05002876 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00002877 Py_DECREF(output);
2878 return 0;
2879 }
2880 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002881 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00002882}
2883
2884
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002885int
2886PyUnicode_FSDecoder(PyObject* arg, void* addr)
2887{
2888 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002889 if (arg == NULL) {
2890 Py_DECREF(*(PyObject**)addr);
2891 return 1;
2892 }
2893 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002894 if (PyUnicode_READY(arg))
2895 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002896 output = arg;
2897 Py_INCREF(output);
2898 }
2899 else {
2900 arg = PyBytes_FromObject(arg);
2901 if (!arg)
2902 return 0;
2903 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
2904 PyBytes_GET_SIZE(arg));
2905 Py_DECREF(arg);
2906 if (!output)
2907 return 0;
2908 if (!PyUnicode_Check(output)) {
2909 Py_DECREF(output);
2910 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
2911 return 0;
2912 }
2913 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002914 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
2915 PyUnicode_GET_LENGTH(output), 0, 1)) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002916 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2917 Py_DECREF(output);
2918 return 0;
2919 }
2920 *(PyObject**)addr = output;
2921 return Py_CLEANUP_SUPPORTED;
2922}
2923
2924
Martin v. Löwis5b222132007-06-10 09:51:05 +00002925char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002926PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002927{
Christian Heimesf3863112007-11-22 07:46:41 +00002928 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002929 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
2930
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00002931 if (!PyUnicode_Check(unicode)) {
2932 PyErr_BadArgument();
2933 return NULL;
2934 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002935 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002936 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002937
Victor Stinnere90fe6a2011-10-01 16:48:13 +02002938 if (PyUnicode_UTF8(unicode) == NULL) {
2939 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002940 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
2941 if (bytes == NULL)
2942 return NULL;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02002943 _PyUnicode_UTF8(u) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
2944 if (_PyUnicode_UTF8(u) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002945 Py_DECREF(bytes);
2946 return NULL;
2947 }
Victor Stinnere90fe6a2011-10-01 16:48:13 +02002948 _PyUnicode_UTF8_LENGTH(u) = PyBytes_GET_SIZE(bytes);
2949 Py_MEMCPY(_PyUnicode_UTF8(u), PyBytes_AS_STRING(bytes), _PyUnicode_UTF8_LENGTH(u) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002950 Py_DECREF(bytes);
2951 }
2952
2953 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02002954 *psize = PyUnicode_UTF8_LENGTH(unicode);
2955 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00002956}
2957
2958char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002959PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00002960{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002961 return PyUnicode_AsUTF8AndSize(unicode, NULL);
2962}
2963
#ifdef Py_DEBUG
/* Debug-build counter: incremented by PyUnicode_AsUnicodeAndSize() each
   time it has to build the wchar_t representation of an object. */
int unicode_as_unicode_calls = 0;
#endif
2967
2968
2969Py_UNICODE *
2970PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
2971{
2972 PyUnicodeObject *u;
2973 const unsigned char *one_byte;
2974#if SIZEOF_WCHAR_T == 4
2975 const Py_UCS2 *two_bytes;
2976#else
2977 const Py_UCS4 *four_bytes;
2978 const Py_UCS4 *ucs4_end;
2979 Py_ssize_t num_surrogates;
2980#endif
2981 wchar_t *w;
2982 wchar_t *wchar_end;
2983
2984 if (!PyUnicode_Check(unicode)) {
2985 PyErr_BadArgument();
2986 return NULL;
2987 }
2988 u = (PyUnicodeObject*)unicode;
2989 if (_PyUnicode_WSTR(u) == NULL) {
2990 /* Non-ASCII compact unicode object */
2991 assert(_PyUnicode_KIND(u) != 0);
2992 assert(PyUnicode_IS_READY(u));
2993
2994#ifdef Py_DEBUG
2995 ++unicode_as_unicode_calls;
2996#endif
2997
2998 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
2999#if SIZEOF_WCHAR_T == 2
3000 four_bytes = PyUnicode_4BYTE_DATA(u);
3001 ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
3002 num_surrogates = 0;
3003
3004 for (; four_bytes < ucs4_end; ++four_bytes) {
3005 if (*four_bytes > 0xFFFF)
3006 ++num_surrogates;
3007 }
3008
3009 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
3010 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
3011 if (!_PyUnicode_WSTR(u)) {
3012 PyErr_NoMemory();
3013 return NULL;
3014 }
3015 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;
3016
3017 w = _PyUnicode_WSTR(u);
3018 wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
3019 four_bytes = PyUnicode_4BYTE_DATA(u);
3020 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3021 if (*four_bytes > 0xFFFF) {
3022 /* encode surrogate pair in this case */
3023 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
3024 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
3025 }
3026 else
3027 *w = *four_bytes;
3028
3029 if (w > wchar_end) {
3030 assert(0 && "Miscalculated string end");
3031 }
3032 }
3033 *w = 0;
3034#else
3035 /* sizeof(wchar_t) == 4 */
3036 Py_FatalError("Impossible unicode object state, wstr and str "
3037 "should share memory already.");
3038 return NULL;
3039#endif
3040 }
3041 else {
3042 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3043 (_PyUnicode_LENGTH(u) + 1));
3044 if (!_PyUnicode_WSTR(u)) {
3045 PyErr_NoMemory();
3046 return NULL;
3047 }
3048 if (!PyUnicode_IS_COMPACT_ASCII(u))
3049 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
3050 w = _PyUnicode_WSTR(u);
3051 wchar_end = w + _PyUnicode_LENGTH(u);
3052
3053 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
3054 one_byte = PyUnicode_1BYTE_DATA(u);
3055 for (; w < wchar_end; ++one_byte, ++w)
3056 *w = *one_byte;
3057 /* null-terminate the wstr */
3058 *w = 0;
3059 }
3060 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
3061#if SIZEOF_WCHAR_T == 4
3062 two_bytes = PyUnicode_2BYTE_DATA(u);
3063 for (; w < wchar_end; ++two_bytes, ++w)
3064 *w = *two_bytes;
3065 /* null-terminate the wstr */
3066 *w = 0;
3067#else
3068 /* sizeof(wchar_t) == 2 */
3069 PyObject_FREE(_PyUnicode_WSTR(u));
3070 _PyUnicode_WSTR(u) = NULL;
3071 Py_FatalError("Impossible unicode object state, wstr "
3072 "and str should share memory already.");
3073 return NULL;
3074#endif
3075 }
3076 else {
3077 assert(0 && "This should never happen.");
3078 }
3079 }
3080 }
3081 if (size != NULL)
3082 *size = PyUnicode_WSTR_LENGTH(u);
3083 return _PyUnicode_WSTR(u);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003084}
3085
Alexander Belopolsky40018472011-02-26 01:02:56 +00003086Py_UNICODE *
3087PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003088{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003089 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003090}
3091
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003092
Alexander Belopolsky40018472011-02-26 01:02:56 +00003093Py_ssize_t
3094PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003095{
3096 if (!PyUnicode_Check(unicode)) {
3097 PyErr_BadArgument();
3098 goto onError;
3099 }
3100 return PyUnicode_GET_SIZE(unicode);
3101
Benjamin Peterson29060642009-01-31 22:14:21 +00003102 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003103 return -1;
3104}
3105
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003106Py_ssize_t
3107PyUnicode_GetLength(PyObject *unicode)
3108{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003109 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003110 PyErr_BadArgument();
3111 return -1;
3112 }
3113
3114 return PyUnicode_GET_LENGTH(unicode);
3115}
3116
3117Py_UCS4
3118PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3119{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003120 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3121 PyErr_BadArgument();
3122 return (Py_UCS4)-1;
3123 }
3124 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3125 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003126 return (Py_UCS4)-1;
3127 }
3128 return PyUnicode_READ_CHAR(unicode, index);
3129}
3130
3131int
3132PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3133{
3134 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003135 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003136 return -1;
3137 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02003138 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3139 PyErr_SetString(PyExc_IndexError, "string index out of range");
3140 return -1;
3141 }
3142 if (_PyUnicode_Dirty(unicode))
3143 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003144 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3145 index, ch);
3146 return 0;
3147}
3148
/* Return the name of the default encoding, which is fixed to "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3154
Victor Stinner554f3f02010-06-16 23:33:54 +00003155/* create or adjust a UnicodeDecodeError */
3156static void
3157make_decode_exception(PyObject **exceptionObject,
3158 const char *encoding,
3159 const char *input, Py_ssize_t length,
3160 Py_ssize_t startpos, Py_ssize_t endpos,
3161 const char *reason)
3162{
3163 if (*exceptionObject == NULL) {
3164 *exceptionObject = PyUnicodeDecodeError_Create(
3165 encoding, input, length, startpos, endpos, reason);
3166 }
3167 else {
3168 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3169 goto onError;
3170 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3171 goto onError;
3172 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3173 goto onError;
3174 }
3175 return;
3176
3177onError:
3178 Py_DECREF(*exceptionObject);
3179 *exceptionObject = NULL;
3180}
3181
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003182/* error handling callback helper:
3183 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003184 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003185 and adjust various state variables.
3186 return 0 on success, -1 on error
3187*/
3188
Alexander Belopolsky40018472011-02-26 01:02:56 +00003189static int
3190unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003191 const char *encoding, const char *reason,
3192 const char **input, const char **inend, Py_ssize_t *startinpos,
3193 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
3194 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003195{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003196 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003197
3198 PyObject *restuple = NULL;
3199 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003200 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003201 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003202 Py_ssize_t requiredsize;
3203 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003204 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003205 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003206 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003207 int res = -1;
3208
3209 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003210 *errorHandler = PyCodec_LookupError(errors);
3211 if (*errorHandler == NULL)
3212 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003213 }
3214
Victor Stinner554f3f02010-06-16 23:33:54 +00003215 make_decode_exception(exceptionObject,
3216 encoding,
3217 *input, *inend - *input,
3218 *startinpos, *endinpos,
3219 reason);
3220 if (*exceptionObject == NULL)
3221 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003222
3223 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3224 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003225 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003226 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003227 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003228 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003229 }
3230 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003231 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003232
3233 /* Copy back the bytes variables, which might have been modified by the
3234 callback */
3235 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3236 if (!inputobj)
3237 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003238 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003239 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003240 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003241 *input = PyBytes_AS_STRING(inputobj);
3242 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003243 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003244 /* we can DECREF safely, as the exception has another reference,
3245 so the object won't go away. */
3246 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003247
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003248 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003249 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003250 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003251 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3252 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003253 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003254
3255 /* need more space? (at least enough for what we
3256 have+the replacement+the rest of the string (starting
3257 at the new input position), so we won't have to check space
3258 when there are no errors in the rest of the string) */
3259 repptr = PyUnicode_AS_UNICODE(repunicode);
3260 repsize = PyUnicode_GET_SIZE(repunicode);
3261 requiredsize = *outpos + repsize + insize-newpos;
3262 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003263 if (requiredsize<2*outsize)
3264 requiredsize = 2*outsize;
Victor Stinnerfe226c02011-10-03 03:52:20 +02003265 if (PyUnicode_Resize((PyObject**)output, requiredsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003266 goto onError;
3267 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003268 }
3269 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003270 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003271 Py_UNICODE_COPY(*outptr, repptr, repsize);
3272 *outptr += repsize;
3273 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003274
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003275 /* we made it! */
3276 res = 0;
3277
Benjamin Peterson29060642009-01-31 22:14:21 +00003278 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003279 Py_XDECREF(restuple);
3280 return res;
3281}
3282
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003283/* --- UTF-7 Codec -------------------------------------------------------- */
3284
Antoine Pitrou244651a2009-05-04 18:56:13 +00003285/* See RFC2152 for details. We encode conservatively and decode liberally. */
3286
/* Base-64 helper macros used by the UTF-7 codec.  All of them may
   evaluate their argument more than once. */

/* True if c is one of the 64 base-64 alphabet characters. */
#define IS_BASE64(c)                        \
    (((c) >= 'A' && (c) <= 'Z') ||          \
     ((c) >= 'a' && (c) <= 'z') ||          \
     ((c) >= '0' && (c) <= '9') ||          \
     (c) == '+' || (c) == '/')

/* Value (0..63) of the base-64 character c.  Only meaningful when
   IS_BASE64(c) holds: anything unrecognised maps to 63. */
#define FROM_BASE64(c)                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     (c) == '+' ? 62 : 63)

/* Base-64 character for the low 6 bits of n. */
#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: true if this byte, met outside a shift sequence of a
 * UTF-7 string, decodes as itself.  Decoding is permissive: the only
 * ASCII byte not decoding to itself is the '+' that opens a base-64
 * section. */
#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')
3317
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c may be evaluated several times.  directO and directWS are wrapped in
 * parentheses so that compound expressions (e.g. `a || b`) combine
 * correctly with the `&&` in the expansion. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003363
Alexander Belopolsky40018472011-02-26 01:02:56 +00003364PyObject *
3365PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003366 Py_ssize_t size,
3367 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003368{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003369 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3370}
3371
Antoine Pitrou244651a2009-05-04 18:56:13 +00003372/* The decoder. The only state we preserve is our read position,
3373 * i.e. how many characters we have consumed. So if we end in the
3374 * middle of a shift sequence we have to back off the read position
3375 * and the output to the beginning of the sequence, otherwise we lose
3376 * all the shift state (seen bits, number of bits seen, high
3377 * surrogate). */
3378
Alexander Belopolsky40018472011-02-26 01:02:56 +00003379PyObject *
3380PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003381 Py_ssize_t size,
3382 const char *errors,
3383 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003384{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003385 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003386 Py_ssize_t startinpos;
3387 Py_ssize_t endinpos;
3388 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003389 const char *e;
3390 PyUnicodeObject *unicode;
3391 Py_UNICODE *p;
3392 const char *errmsg = "";
3393 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003394 Py_UNICODE *shiftOutStart;
3395 unsigned int base64bits = 0;
3396 unsigned long base64buffer = 0;
3397 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003398 PyObject *errorHandler = NULL;
3399 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003400
3401 unicode = _PyUnicode_New(size);
3402 if (!unicode)
3403 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003404 if (size == 0) {
3405 if (consumed)
3406 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003407 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003408 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003409
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003410 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003411 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003412 e = s + size;
3413
3414 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003415 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003416 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003417 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003418
Antoine Pitrou244651a2009-05-04 18:56:13 +00003419 if (inShift) { /* in a base-64 section */
3420 if (IS_BASE64(ch)) { /* consume a base-64 character */
3421 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3422 base64bits += 6;
3423 s++;
3424 if (base64bits >= 16) {
3425 /* we have enough bits for a UTF-16 value */
3426 Py_UNICODE outCh = (Py_UNICODE)
3427 (base64buffer >> (base64bits-16));
3428 base64bits -= 16;
3429 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3430 if (surrogate) {
3431 /* expecting a second surrogate */
3432 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3433#ifdef Py_UNICODE_WIDE
3434 *p++ = (((surrogate & 0x3FF)<<10)
3435 | (outCh & 0x3FF)) + 0x10000;
3436#else
3437 *p++ = surrogate;
3438 *p++ = outCh;
3439#endif
3440 surrogate = 0;
3441 }
3442 else {
3443 surrogate = 0;
3444 errmsg = "second surrogate missing";
3445 goto utf7Error;
3446 }
3447 }
3448 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3449 /* first surrogate */
3450 surrogate = outCh;
3451 }
3452 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3453 errmsg = "unexpected second surrogate";
3454 goto utf7Error;
3455 }
3456 else {
3457 *p++ = outCh;
3458 }
3459 }
3460 }
3461 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003462 inShift = 0;
3463 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003464 if (surrogate) {
3465 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003466 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003467 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003468 if (base64bits > 0) { /* left-over bits */
3469 if (base64bits >= 6) {
3470 /* We've seen at least one base-64 character */
3471 errmsg = "partial character in shift sequence";
3472 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003473 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003474 else {
3475 /* Some bits remain; they should be zero */
3476 if (base64buffer != 0) {
3477 errmsg = "non-zero padding bits in shift sequence";
3478 goto utf7Error;
3479 }
3480 }
3481 }
3482 if (ch != '-') {
3483 /* '-' is absorbed; other terminating
3484 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003485 *p++ = ch;
3486 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003487 }
3488 }
3489 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003490 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003491 s++; /* consume '+' */
3492 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003493 s++;
3494 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003495 }
3496 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003497 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003498 shiftOutStart = p;
3499 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003500 }
3501 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003502 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003503 *p++ = ch;
3504 s++;
3505 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003506 else {
3507 startinpos = s-starts;
3508 s++;
3509 errmsg = "unexpected special character";
3510 goto utf7Error;
3511 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003512 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003513utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003514 outpos = p-PyUnicode_AS_UNICODE(unicode);
3515 endinpos = s-starts;
3516 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003517 errors, &errorHandler,
3518 "utf7", errmsg,
3519 &starts, &e, &startinpos, &endinpos, &exc, &s,
3520 &unicode, &outpos, &p))
3521 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003522 }
3523
Antoine Pitrou244651a2009-05-04 18:56:13 +00003524 /* end of string */
3525
3526 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3527 /* if we're in an inconsistent state, that's an error */
3528 if (surrogate ||
3529 (base64bits >= 6) ||
3530 (base64bits > 0 && base64buffer != 0)) {
3531 outpos = p-PyUnicode_AS_UNICODE(unicode);
3532 endinpos = size;
3533 if (unicode_decode_call_errorhandler(
3534 errors, &errorHandler,
3535 "utf7", "unterminated shift sequence",
3536 &starts, &e, &startinpos, &endinpos, &exc, &s,
3537 &unicode, &outpos, &p))
3538 goto onError;
3539 if (s < e)
3540 goto restart;
3541 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003542 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003543
3544 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003545 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003546 if (inShift) {
3547 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003548 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003549 }
3550 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003551 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003552 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003553 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003554
Victor Stinnerfe226c02011-10-03 03:52:20 +02003555 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003556 goto onError;
3557
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003558 Py_XDECREF(errorHandler);
3559 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003560 if (PyUnicode_READY(unicode) == -1) {
3561 Py_DECREF(unicode);
3562 return NULL;
3563 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003564 return (PyObject *)unicode;
3565
Benjamin Peterson29060642009-01-31 22:14:21 +00003566 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003567 Py_XDECREF(errorHandler);
3568 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003569 Py_DECREF(unicode);
3570 return NULL;
3571}
3572
3573
Alexander Belopolsky40018472011-02-26 01:02:56 +00003574PyObject *
3575PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003576 Py_ssize_t size,
3577 int base64SetO,
3578 int base64WhiteSpace,
3579 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003580{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003581 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003582 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003583 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003584 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003585 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003586 unsigned int base64bits = 0;
3587 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003588 char * out;
3589 char * start;
3590
3591 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003592 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003593
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003594 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003595 return PyErr_NoMemory();
3596
Antoine Pitrou244651a2009-05-04 18:56:13 +00003597 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003598 if (v == NULL)
3599 return NULL;
3600
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003601 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003602 for (;i < size; ++i) {
3603 Py_UNICODE ch = s[i];
3604
Antoine Pitrou244651a2009-05-04 18:56:13 +00003605 if (inShift) {
3606 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3607 /* shifting out */
3608 if (base64bits) { /* output remaining bits */
3609 *out++ = TO_BASE64(base64buffer << (6-base64bits));
3610 base64buffer = 0;
3611 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003612 }
3613 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003614 /* Characters not in the BASE64 set implicitly unshift the sequence
3615 so no '-' is required, except if the character is itself a '-' */
3616 if (IS_BASE64(ch) || ch == '-') {
3617 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003618 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003619 *out++ = (char) ch;
3620 }
3621 else {
3622 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00003623 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003624 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003625 else { /* not in a shift sequence */
3626 if (ch == '+') {
3627 *out++ = '+';
3628 *out++ = '-';
3629 }
3630 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3631 *out++ = (char) ch;
3632 }
3633 else {
3634 *out++ = '+';
3635 inShift = 1;
3636 goto encode_char;
3637 }
3638 }
3639 continue;
3640encode_char:
3641#ifdef Py_UNICODE_WIDE
3642 if (ch >= 0x10000) {
3643 /* code first surrogate */
3644 base64bits += 16;
3645 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
3646 while (base64bits >= 6) {
3647 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3648 base64bits -= 6;
3649 }
3650 /* prepare second surrogate */
3651 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
3652 }
3653#endif
3654 base64bits += 16;
3655 base64buffer = (base64buffer << 16) | ch;
3656 while (base64bits >= 6) {
3657 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3658 base64bits -= 6;
3659 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00003660 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003661 if (base64bits)
3662 *out++= TO_BASE64(base64buffer << (6-base64bits) );
3663 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003664 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003665 if (_PyBytes_Resize(&v, out - start) < 0)
3666 return NULL;
3667 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003668}
3669
Antoine Pitrou244651a2009-05-04 18:56:13 +00003670#undef IS_BASE64
3671#undef FROM_BASE64
3672#undef TO_BASE64
3673#undef DECODE_DIRECT
3674#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003675
Guido van Rossumd57fd912000-03-10 22:53:23 +00003676/* --- UTF-8 Codec -------------------------------------------------------- */
3677
/* Map UTF-8 encoded prefix byte to sequence length.  Zero means
   illegal prefix.  See RFC 3629 for details */
static
char utf8_code_length[256] = {
    /* 00-0F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 10-1F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 20-2F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 30-3F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 40-4F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 50-5F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 60-6F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 70-7F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 80-8F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 90-9F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* A0-AF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* B0-BF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* C0-C1 + C2-CF */
                0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* D0-DF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* E0-EF */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* F0-F4 + F5-FF */
                4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
3699
Alexander Belopolsky40018472011-02-26 01:02:56 +00003700PyObject *
3701PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003702 Py_ssize_t size,
3703 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003704{
Walter Dörwald69652032004-09-07 20:24:22 +00003705 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3706}
3707
Antoine Pitrouab868312009-01-10 15:40:25 +00003708/* Mask to check or force alignment of a pointer to C 'long' boundaries */
3709#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
3710
3711/* Mask to quickly check whether a C 'long' contains a
3712 non-ASCII, UTF8-encoded char. */
3713#if (SIZEOF_LONG == 8)
3714# define ASCII_CHAR_MASK 0x8080808080808080L
3715#elif (SIZEOF_LONG == 4)
3716# define ASCII_CHAR_MASK 0x80808080L
3717#else
3718# error C 'long' size should be either 4 or 8!
3719#endif
3720
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003721/* Scans a UTF-8 string and returns the maximum character to be expected,
3722 the size of the decoded unicode string and if any major errors were
3723 encountered.
3724
3725 This function does check basic UTF-8 sanity, it does however NOT CHECK
3726 if the string contains surrogates, and if all continuation bytes are
3727 within the correct ranges, these checks are performed in
3728 PyUnicode_DecodeUTF8Stateful.
3729
3730 If it sets has_errors to 1, it means the value of unicode_size and max_char
3731 will be bogus and you should not rely on useful information in them.
3732 */
3733static Py_UCS4
3734utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
3735 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
3736 int *has_errors)
3737{
3738 Py_ssize_t n;
3739 Py_ssize_t char_count = 0;
3740 Py_UCS4 max_char = 127, new_max;
3741 Py_UCS4 upper_bound;
3742 const unsigned char *p = (const unsigned char *)s;
3743 const unsigned char *end = p + string_size;
3744 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
3745 int err = 0;
3746
3747 for (; p < end && !err; ++p, ++char_count) {
3748 /* Only check value if it's not a ASCII char... */
3749 if (*p < 0x80) {
3750 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
3751 an explanation. */
3752 if (!((size_t) p & LONG_PTR_MASK)) {
3753 /* Help register allocation */
3754 register const unsigned char *_p = p;
3755 while (_p < aligned_end) {
3756 unsigned long value = *(unsigned long *) _p;
3757 if (value & ASCII_CHAR_MASK)
3758 break;
3759 _p += SIZEOF_LONG;
3760 char_count += SIZEOF_LONG;
3761 }
3762 p = _p;
3763 if (p == end)
3764 break;
3765 }
3766 }
3767 if (*p >= 0x80) {
3768 n = utf8_code_length[*p];
3769 new_max = max_char;
3770 switch (n) {
3771 /* invalid start byte */
3772 case 0:
3773 err = 1;
3774 break;
3775 case 2:
3776 /* Code points between 0x00FF and 0x07FF inclusive.
3777 Approximate the upper bound of the code point,
3778 if this flips over 255 we can be sure it will be more
3779 than 255 and the string will need 2 bytes per code coint,
3780 if it stays under or equal to 255, we can be sure 1 byte
3781 is enough.
3782 ((*p & 0b00011111) << 6) | 0b00111111 */
3783 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
3784 if (max_char < upper_bound)
3785 new_max = upper_bound;
3786 /* Ensure we track at least that we left ASCII space. */
3787 if (new_max < 128)
3788 new_max = 128;
3789 break;
3790 case 3:
3791 /* Between 0x0FFF and 0xFFFF inclusive, so values are
3792 always > 255 and <= 65535 and will always need 2 bytes. */
3793 if (max_char < 65535)
3794 new_max = 65535;
3795 break;
3796 case 4:
3797 /* Code point will be above 0xFFFF for sure in this case. */
3798 new_max = 65537;
3799 break;
3800 /* Internal error, this should be caught by the first if */
3801 case 1:
3802 default:
3803 assert(0 && "Impossible case in utf8_max_char_and_size");
3804 err = 1;
3805 }
3806 /* Instead of number of overall bytes for this code point,
3807 n containts the number of following bytes: */
3808 --n;
3809 /* Check if the follow up chars are all valid continuation bytes */
3810 if (n >= 1) {
3811 const unsigned char *cont;
3812 if ((p + n) >= end) {
3813 if (consumed == 0)
3814 /* incomplete data, non-incremental decoding */
3815 err = 1;
3816 break;
3817 }
3818 for (cont = p + 1; cont < (p + n); ++cont) {
3819 if ((*cont & 0xc0) != 0x80) {
3820 err = 1;
3821 break;
3822 }
3823 }
3824 p += n;
3825 }
3826 else
3827 err = 1;
3828 max_char = new_max;
3829 }
3830 }
3831
3832 if (unicode_size)
3833 *unicode_size = char_count;
3834 if (has_errors)
3835 *has_errors = err;
3836 return max_char;
3837}
3838
/* Store `value` at position `index` of the character buffer `buf`, using
   the element width selected by `kind`.  Similar to PyUnicode_WRITE, but
   `kind` may additionally be PyUnicode_WCHAR_KIND, in which case `buf` is
   written as a Py_UNICODE array (the wstr field of the legacy unicode
   representation).  Any kind that is not WCHAR, 1BYTE or 2BYTE is written
   as Py_UCS4.  Exactly one store is executed, so `index` and `value` are
   each evaluated once. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value) \
    do { \
        switch ((int)(kind)) { \
        case PyUnicode_WCHAR_KIND: \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
            break; \
        case PyUnicode_1BYTE_KIND: \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
            break; \
        case PyUnicode_2BYTE_KIND: \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value); \
            break; \
        default: \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value); \
            break; \
        } \
    } while (0)
3853
Alexander Belopolsky40018472011-02-26 01:02:56 +00003854PyObject *
3855PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003856 Py_ssize_t size,
3857 const char *errors,
3858 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003859{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003860 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003861 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00003862 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003863 Py_ssize_t startinpos;
3864 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00003865 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003866 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003867 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003868 PyObject *errorHandler = NULL;
3869 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003870 Py_UCS4 maxchar = 0;
3871 Py_ssize_t unicode_size;
3872 Py_ssize_t i;
3873 int kind;
3874 void *data;
3875 int has_errors;
3876 Py_UNICODE *error_outptr;
3877#if SIZEOF_WCHAR_T == 2
3878 Py_ssize_t wchar_offset = 0;
3879#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003880
Walter Dörwald69652032004-09-07 20:24:22 +00003881 if (size == 0) {
3882 if (consumed)
3883 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003884 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00003885 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003886 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
3887 consumed, &has_errors);
3888 if (has_errors) {
3889 unicode = _PyUnicode_New(size);
3890 if (!unicode)
3891 return NULL;
3892 kind = PyUnicode_WCHAR_KIND;
3893 data = PyUnicode_AS_UNICODE(unicode);
3894 assert(data != NULL);
3895 }
3896 else {
3897 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
3898 if (!unicode)
3899 return NULL;
3900 /* When the string is ASCII only, just use memcpy and return.
3901 unicode_size may be != size if there is an incomplete UTF-8
3902 sequence at the end of the ASCII block. */
3903 if (maxchar < 128 && size == unicode_size) {
3904 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
3905 return (PyObject *)unicode;
3906 }
3907 kind = PyUnicode_KIND(unicode);
3908 data = PyUnicode_DATA(unicode);
3909 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003910 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003911 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003912 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00003913 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003914
3915 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003916 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003917
3918 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00003919 /* Fast path for runs of ASCII characters. Given that common UTF-8
3920 input will consist of an overwhelming majority of ASCII
3921 characters, we try to optimize for this case by checking
3922 as many characters as a C 'long' can contain.
3923 First, check if we can do an aligned read, as most CPUs have
3924 a penalty for unaligned reads.
3925 */
3926 if (!((size_t) s & LONG_PTR_MASK)) {
3927 /* Help register allocation */
3928 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003929 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003930 while (_s < aligned_end) {
3931 /* Read a whole long at a time (either 4 or 8 bytes),
3932 and do a fast unrolled copy if it only contains ASCII
3933 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003934 unsigned long value = *(unsigned long *) _s;
3935 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00003936 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003937 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
3938 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
3939 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
3940 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00003941#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003942 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
3943 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
3944 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
3945 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00003946#endif
3947 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003948 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00003949 }
3950 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003951 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003952 if (s == e)
3953 break;
3954 ch = (unsigned char)*s;
3955 }
3956 }
3957
3958 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003959 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003960 s++;
3961 continue;
3962 }
3963
3964 n = utf8_code_length[ch];
3965
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003966 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003967 if (consumed)
3968 break;
3969 else {
3970 errmsg = "unexpected end of data";
3971 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003972 endinpos = startinpos+1;
3973 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
3974 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00003975 goto utf8Error;
3976 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003977 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003978
3979 switch (n) {
3980
3981 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00003982 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003983 startinpos = s-starts;
3984 endinpos = startinpos+1;
3985 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003986
3987 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003988 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00003989 startinpos = s-starts;
3990 endinpos = startinpos+1;
3991 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003992
3993 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003994 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00003995 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003996 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003997 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00003998 goto utf8Error;
3999 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004000 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004001 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004002 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004003 break;
4004
4005 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004006 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4007 will result in surrogates in range d800-dfff. Surrogates are
4008 not valid UTF-8 so they are rejected.
4009 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4010 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004011 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004012 (s[2] & 0xc0) != 0x80 ||
4013 ((unsigned char)s[0] == 0xE0 &&
4014 (unsigned char)s[1] < 0xA0) ||
4015 ((unsigned char)s[0] == 0xED &&
4016 (unsigned char)s[1] > 0x9F)) {
4017 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004018 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004019 endinpos = startinpos + 1;
4020
4021 /* if s[1] first two bits are 1 and 0, then the invalid
4022 continuation byte is s[2], so increment endinpos by 1,
4023 if not, s[1] is invalid and endinpos doesn't need to
4024 be incremented. */
4025 if ((s[1] & 0xC0) == 0x80)
4026 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004027 goto utf8Error;
4028 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004029 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004030 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004031 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004032 break;
4033
4034 case 4:
4035 if ((s[1] & 0xc0) != 0x80 ||
4036 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004037 (s[3] & 0xc0) != 0x80 ||
4038 ((unsigned char)s[0] == 0xF0 &&
4039 (unsigned char)s[1] < 0x90) ||
4040 ((unsigned char)s[0] == 0xF4 &&
4041 (unsigned char)s[1] > 0x8F)) {
4042 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004043 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004044 endinpos = startinpos + 1;
4045 if ((s[1] & 0xC0) == 0x80) {
4046 endinpos++;
4047 if ((s[2] & 0xC0) == 0x80)
4048 endinpos++;
4049 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004050 goto utf8Error;
4051 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004052 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004053 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4054 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4055
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004056 /* If the string is flexible or we have native UCS-4, write
4057 directly.. */
4058 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
4059 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00004060
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004061 else {
4062 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00004063
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004064 /* translate from 10000..10FFFF to 0..FFFF */
4065 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00004066
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004067 /* high surrogate = top 10 bits added to D800 */
4068 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4069 (Py_UNICODE)(0xD800 + (ch >> 10)));
4070
4071 /* low surrogate = bottom 10 bits added to DC00 */
4072 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4073 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
4074 }
4075#if SIZEOF_WCHAR_T == 2
4076 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004077#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004078 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004079 }
4080 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004081 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004082
Benjamin Peterson29060642009-01-31 22:14:21 +00004083 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004084 /* If this is not yet a resizable string, make it one.. */
4085 if (kind != PyUnicode_WCHAR_KIND) {
4086 const Py_UNICODE *u;
4087 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
4088 if (!new_unicode)
4089 goto onError;
4090 u = PyUnicode_AsUnicode((PyObject *)unicode);
4091 if (!u)
4092 goto onError;
4093#if SIZEOF_WCHAR_T == 2
4094 i += wchar_offset;
4095#endif
4096 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
4097 Py_DECREF(unicode);
4098 unicode = new_unicode;
4099 kind = 0;
4100 data = PyUnicode_AS_UNICODE(new_unicode);
4101 assert(data != NULL);
4102 }
4103 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00004104 if (unicode_decode_call_errorhandler(
4105 errors, &errorHandler,
4106 "utf8", errmsg,
4107 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004108 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00004109 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004110 /* Update data because unicode_decode_call_errorhandler might have
4111 re-created or resized the unicode object. */
4112 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00004113 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004114 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004115 /* Ensure the unicode_size calculation above was correct: */
4116 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
4117
Walter Dörwald69652032004-09-07 20:24:22 +00004118 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004119 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004120
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004121 /* Adjust length and ready string when it contained errors and
4122 is of the old resizable kind. */
4123 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02004124 if (PyUnicode_Resize((PyObject**)&unicode, i) < 0 ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004125 PyUnicode_READY(unicode) == -1)
4126 goto onError;
4127 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004128
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004129 Py_XDECREF(errorHandler);
4130 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004131 if (PyUnicode_READY(unicode) == -1) {
4132 Py_DECREF(unicode);
4133 return NULL;
4134 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004135 return (PyObject *)unicode;
4136
Benjamin Peterson29060642009-01-31 22:14:21 +00004137 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004138 Py_XDECREF(errorHandler);
4139 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004140 Py_DECREF(unicode);
4141 return NULL;
4142}
4143
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004144#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00004145
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Decodes the `size` bytes at `s` into a freshly PyMem_Malloc'ed,
   NUL-terminated wchar_t array.  Every byte that is not part of a valid
   sequence is emitted on its own as 0xDC00 + byte, and decoding resumes
   at the following byte.  Returns NULL if the buffer size would overflow
   (after calling PyErr_NoMemory) or if the allocation fails. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *stop;
    wchar_t *buffer, *out;

    /* Note: size will always be longer than the resulting Unicode
       character count, so one slot per input byte plus the terminator
       is enough. */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    buffer = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (buffer == NULL)
        return NULL;

    /* Unpack UTF-8 encoded data */
    out = buffer;
    for (stop = s + size; s < stop; ) {
        const unsigned char *b = (const unsigned char *)s;
        Py_UCS4 ch = b[0];
        int seqlen;

        if (ch < 0x80) {
            *out++ = (wchar_t)ch;
            s++;
            continue;
        }

        seqlen = utf8_code_length[ch];
        if (s + seqlen > stop) {
            /* Truncated sequence: handled exactly like an invalid
               start byte, i.e. the byte is escaped. */
            seqlen = 0;
        }

        /* Each valid case stores the code point, advances past the whole
           sequence and continues; anything that breaks out of the switch
           falls through to the escape code below with `ch` still holding
           the start byte. */
        switch (seqlen) {
        case 0:
        case 1:
            break;

        case 2:
            if ((b[1] & 0xc0) == 0x80) {
                ch = ((b[0] & 0x1f) << 6) + (b[1] & 0x3f);
                assert ((ch > 0x007F) && (ch <= 0x07FF));
                *out++ = (wchar_t)ch;
                s += 2;
                continue;
            }
            break;

        case 3:
            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
               will result in surrogates in range d800-dfff. Surrogates are
               not valid UTF-8 so they are rejected.
               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
            if ((b[1] & 0xc0) == 0x80 &&
                (b[2] & 0xc0) == 0x80 &&
                !(b[0] == 0xE0 && b[1] < 0xA0) &&
                !(b[0] == 0xED && b[1] > 0x9F)) {
                ch = ((b[0] & 0x0f) << 12) + ((b[1] & 0x3f) << 6) + (b[2] & 0x3f);
                assert ((ch > 0x07FF) && (ch <= 0xFFFF));
                *out++ = (wchar_t)ch;
                s += 3;
                continue;
            }
            break;

        case 4:
            if ((b[1] & 0xc0) == 0x80 &&
                (b[2] & 0xc0) == 0x80 &&
                (b[3] & 0xc0) == 0x80 &&
                !(b[0] == 0xF0 && b[1] < 0x90) &&
                !(b[0] == 0xF4 && b[1] > 0x8F)) {
                ch = ((b[0] & 0x7) << 18) + ((b[1] & 0x3f) << 12) +
                     ((b[2] & 0x3f) << 6) + (b[3] & 0x3f);
                assert ((ch > 0xFFFF) && (ch <= 0x10ffff));

#if SIZEOF_WCHAR_T == 4
                *out++ = (wchar_t)ch;
#else
                /* compute and append the two surrogates: */

                /* translate from 10000..10FFFF to 0..FFFF */
                ch -= 0x10000;

                /* high surrogate = top 10 bits added to D800 */
                *out++ = (wchar_t)(0xD800 + (ch >> 10));

                /* low surrogate = bottom 10 bits added to DC00 */
                *out++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
#endif
                s += 4;
                continue;
            }
            break;

        default:
            /* Any other table value: skip the sequence without producing
               output. */
            s += seqlen;
            continue;
        }

        /* surrogateescape: map the offending byte into U+DC80..U+DCFF */
        *out++ = 0xDC00 + ch;
        s++;
    }
    *out = L'\0';
    return buffer;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004260
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004261/* Primary internal function which creates utf8 encoded bytes objects.
4262
4263 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004264 and allocate exactly as much space needed at the end. Else allocate the
4265 maximum possible needed (4 result bytes per Unicode character), and return
4266 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004267*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004268PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004269_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004270{
Tim Peters602f7402002-04-27 18:03:26 +00004271#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004272
Guido van Rossum98297ee2007-11-06 21:34:58 +00004273 Py_ssize_t i; /* index into s of next input byte */
4274 PyObject *result; /* result string object */
4275 char *p; /* next free byte in output buffer */
4276 Py_ssize_t nallocated; /* number of result bytes allocated */
4277 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004278 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004279 PyObject *errorHandler = NULL;
4280 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004281 int kind;
4282 void *data;
4283 Py_ssize_t size;
4284 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
4285#if SIZEOF_WCHAR_T == 2
4286 Py_ssize_t wchar_offset = 0;
4287#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004288
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004289 if (!PyUnicode_Check(unicode)) {
4290 PyErr_BadArgument();
4291 return NULL;
4292 }
4293
4294 if (PyUnicode_READY(unicode) == -1)
4295 return NULL;
4296
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004297 if (PyUnicode_UTF8(unicode))
4298 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4299 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004300
4301 kind = PyUnicode_KIND(unicode);
4302 data = PyUnicode_DATA(unicode);
4303 size = PyUnicode_GET_LENGTH(unicode);
4304
Tim Peters602f7402002-04-27 18:03:26 +00004305 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004306
Tim Peters602f7402002-04-27 18:03:26 +00004307 if (size <= MAX_SHORT_UNICHARS) {
4308 /* Write into the stack buffer; nallocated can't overflow.
4309 * At the end, we'll allocate exactly as much heap space as it
4310 * turns out we need.
4311 */
4312 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004313 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004314 p = stackbuf;
4315 }
4316 else {
4317 /* Overallocate on the heap, and give the excess back at the end. */
4318 nallocated = size * 4;
4319 if (nallocated / 4 != size) /* overflow! */
4320 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004321 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004322 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004323 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004324 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004325 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004326
Tim Peters602f7402002-04-27 18:03:26 +00004327 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004328 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004329
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004330 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004331 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004332 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004333
Guido van Rossumd57fd912000-03-10 22:53:23 +00004334 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004335 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004336 *p++ = (char)(0xc0 | (ch >> 6));
4337 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004338 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004339 Py_ssize_t newpos;
4340 PyObject *rep;
4341 Py_ssize_t repsize, k, startpos;
4342 startpos = i-1;
4343#if SIZEOF_WCHAR_T == 2
4344 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00004345#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004346 rep = unicode_encode_call_errorhandler(
4347 errors, &errorHandler, "utf-8", "surrogates not allowed",
4348 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4349 &exc, startpos, startpos+1, &newpos);
4350 if (!rep)
4351 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004352
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004353 if (PyBytes_Check(rep))
4354 repsize = PyBytes_GET_SIZE(rep);
4355 else
4356 repsize = PyUnicode_GET_SIZE(rep);
4357
4358 if (repsize > 4) {
4359 Py_ssize_t offset;
4360
4361 if (result == NULL)
4362 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004363 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004364 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004365
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004366 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4367 /* integer overflow */
4368 PyErr_NoMemory();
4369 goto error;
4370 }
4371 nallocated += repsize - 4;
4372 if (result != NULL) {
4373 if (_PyBytes_Resize(&result, nallocated) < 0)
4374 goto error;
4375 } else {
4376 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004377 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004378 goto error;
4379 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4380 }
4381 p = PyBytes_AS_STRING(result) + offset;
4382 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004383
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004384 if (PyBytes_Check(rep)) {
4385 char *prep = PyBytes_AS_STRING(rep);
4386 for(k = repsize; k > 0; k--)
4387 *p++ = *prep++;
4388 } else /* rep is unicode */ {
4389 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4390 Py_UNICODE c;
4391
4392 for(k=0; k<repsize; k++) {
4393 c = prep[k];
4394 if (0x80 <= c) {
4395 raise_encode_exception(&exc, "utf-8",
4396 PyUnicode_AS_UNICODE(unicode),
4397 size, i-1, i,
4398 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004399 goto error;
4400 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004401 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004402 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004403 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004404 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004405 } else if (ch < 0x10000) {
4406 *p++ = (char)(0xe0 | (ch >> 12));
4407 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4408 *p++ = (char)(0x80 | (ch & 0x3f));
4409 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004410 /* Encode UCS4 Unicode ordinals */
4411 *p++ = (char)(0xf0 | (ch >> 18));
4412 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4413 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4414 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004415#if SIZEOF_WCHAR_T == 2
4416 wchar_offset++;
4417#endif
Tim Peters602f7402002-04-27 18:03:26 +00004418 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004419 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004420
Guido van Rossum98297ee2007-11-06 21:34:58 +00004421 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004422 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004423 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004424 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004425 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004426 }
4427 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004428 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004429 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004430 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004431 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004432 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004433
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004434 Py_XDECREF(errorHandler);
4435 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004436 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004437 error:
4438 Py_XDECREF(errorHandler);
4439 Py_XDECREF(exc);
4440 Py_XDECREF(result);
4441 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004442
Tim Peters602f7402002-04-27 18:03:26 +00004443#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004444}
4445
Alexander Belopolsky40018472011-02-26 01:02:56 +00004446PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004447PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4448 Py_ssize_t size,
4449 const char *errors)
4450{
4451 PyObject *v, *unicode;
4452
4453 unicode = PyUnicode_FromUnicode(s, size);
4454 if (unicode == NULL)
4455 return NULL;
4456 v = _PyUnicode_AsUTF8String(unicode, errors);
4457 Py_DECREF(unicode);
4458 return v;
4459}
4460
4461PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004462PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004463{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004464 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004465}
4466
Walter Dörwald41980ca2007-08-16 21:55:45 +00004467/* --- UTF-32 Codec ------------------------------------------------------- */
4468
4469PyObject *
4470PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004471 Py_ssize_t size,
4472 const char *errors,
4473 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004474{
4475 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4476}
4477
4478PyObject *
4479PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004480 Py_ssize_t size,
4481 const char *errors,
4482 int *byteorder,
4483 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004484{
4485 const char *starts = s;
4486 Py_ssize_t startinpos;
4487 Py_ssize_t endinpos;
4488 Py_ssize_t outpos;
4489 PyUnicodeObject *unicode;
4490 Py_UNICODE *p;
4491#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004492 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004493 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004494#else
4495 const int pairs = 0;
4496#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004497 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004498 int bo = 0; /* assume native ordering by default */
4499 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004500 /* Offsets from q for retrieving bytes in the right order. */
4501#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4502 int iorder[] = {0, 1, 2, 3};
4503#else
4504 int iorder[] = {3, 2, 1, 0};
4505#endif
4506 PyObject *errorHandler = NULL;
4507 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004508
Walter Dörwald41980ca2007-08-16 21:55:45 +00004509 q = (unsigned char *)s;
4510 e = q + size;
4511
4512 if (byteorder)
4513 bo = *byteorder;
4514
4515 /* Check for BOM marks (U+FEFF) in the input and adjust current
4516 byte order setting accordingly. In native mode, the leading BOM
4517 mark is skipped, in all other modes, it is copied to the output
4518 stream as-is (giving a ZWNBSP character). */
4519 if (bo == 0) {
4520 if (size >= 4) {
4521 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004522 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004523#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004524 if (bom == 0x0000FEFF) {
4525 q += 4;
4526 bo = -1;
4527 }
4528 else if (bom == 0xFFFE0000) {
4529 q += 4;
4530 bo = 1;
4531 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004532#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004533 if (bom == 0x0000FEFF) {
4534 q += 4;
4535 bo = 1;
4536 }
4537 else if (bom == 0xFFFE0000) {
4538 q += 4;
4539 bo = -1;
4540 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004541#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004542 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004543 }
4544
4545 if (bo == -1) {
4546 /* force LE */
4547 iorder[0] = 0;
4548 iorder[1] = 1;
4549 iorder[2] = 2;
4550 iorder[3] = 3;
4551 }
4552 else if (bo == 1) {
4553 /* force BE */
4554 iorder[0] = 3;
4555 iorder[1] = 2;
4556 iorder[2] = 1;
4557 iorder[3] = 0;
4558 }
4559
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004560 /* On narrow builds we split characters outside the BMP into two
4561 codepoints => count how much extra space we need. */
4562#ifndef Py_UNICODE_WIDE
4563 for (qq = q; qq < e; qq += 4)
4564 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4565 pairs++;
4566#endif
4567
4568 /* This might be one to much, because of a BOM */
4569 unicode = _PyUnicode_New((size+3)/4+pairs);
4570 if (!unicode)
4571 return NULL;
4572 if (size == 0)
4573 return (PyObject *)unicode;
4574
4575 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004576 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004577
Walter Dörwald41980ca2007-08-16 21:55:45 +00004578 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004579 Py_UCS4 ch;
4580 /* remaining bytes at the end? (size should be divisible by 4) */
4581 if (e-q<4) {
4582 if (consumed)
4583 break;
4584 errmsg = "truncated data";
4585 startinpos = ((const char *)q)-starts;
4586 endinpos = ((const char *)e)-starts;
4587 goto utf32Error;
4588 /* The remaining input chars are ignored if the callback
4589 chooses to skip the input */
4590 }
4591 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4592 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004593
Benjamin Peterson29060642009-01-31 22:14:21 +00004594 if (ch >= 0x110000)
4595 {
4596 errmsg = "codepoint not in range(0x110000)";
4597 startinpos = ((const char *)q)-starts;
4598 endinpos = startinpos+4;
4599 goto utf32Error;
4600 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004601#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004602 if (ch >= 0x10000)
4603 {
4604 *p++ = 0xD800 | ((ch-0x10000) >> 10);
4605 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4606 }
4607 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00004608#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004609 *p++ = ch;
4610 q += 4;
4611 continue;
4612 utf32Error:
4613 outpos = p-PyUnicode_AS_UNICODE(unicode);
4614 if (unicode_decode_call_errorhandler(
4615 errors, &errorHandler,
4616 "utf32", errmsg,
4617 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4618 &unicode, &outpos, &p))
4619 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004620 }
4621
4622 if (byteorder)
4623 *byteorder = bo;
4624
4625 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004626 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004627
4628 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02004629 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004630 goto onError;
4631
4632 Py_XDECREF(errorHandler);
4633 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004634 if (PyUnicode_READY(unicode) == -1) {
4635 Py_DECREF(unicode);
4636 return NULL;
4637 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004638 return (PyObject *)unicode;
4639
Benjamin Peterson29060642009-01-31 22:14:21 +00004640 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00004641 Py_DECREF(unicode);
4642 Py_XDECREF(errorHandler);
4643 Py_XDECREF(exc);
4644 return NULL;
4645}
4646
4647PyObject *
4648PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004649 Py_ssize_t size,
4650 const char *errors,
4651 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004652{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004653 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004654 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004655 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004656#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004657 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004658#else
4659 const int pairs = 0;
4660#endif
4661 /* Offsets from p for storing byte pairs in the right order. */
4662#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4663 int iorder[] = {0, 1, 2, 3};
4664#else
4665 int iorder[] = {3, 2, 1, 0};
4666#endif
4667
Benjamin Peterson29060642009-01-31 22:14:21 +00004668#define STORECHAR(CH) \
4669 do { \
4670 p[iorder[3]] = ((CH) >> 24) & 0xff; \
4671 p[iorder[2]] = ((CH) >> 16) & 0xff; \
4672 p[iorder[1]] = ((CH) >> 8) & 0xff; \
4673 p[iorder[0]] = (CH) & 0xff; \
4674 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00004675 } while(0)
4676
4677 /* In narrow builds we can output surrogate pairs as one codepoint,
4678 so we need less space. */
4679#ifndef Py_UNICODE_WIDE
4680 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004681 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
4682 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
4683 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004684#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004685 nsize = (size - pairs + (byteorder == 0));
4686 bytesize = nsize * 4;
4687 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004688 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004689 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004690 if (v == NULL)
4691 return NULL;
4692
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004693 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004694 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004695 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004696 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004697 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004698
4699 if (byteorder == -1) {
4700 /* force LE */
4701 iorder[0] = 0;
4702 iorder[1] = 1;
4703 iorder[2] = 2;
4704 iorder[3] = 3;
4705 }
4706 else if (byteorder == 1) {
4707 /* force BE */
4708 iorder[0] = 3;
4709 iorder[1] = 2;
4710 iorder[2] = 1;
4711 iorder[3] = 0;
4712 }
4713
4714 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004715 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004716#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004717 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
4718 Py_UCS4 ch2 = *s;
4719 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
4720 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
4721 s++;
4722 size--;
4723 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004724 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004725#endif
4726 STORECHAR(ch);
4727 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004728
4729 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004730 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004731#undef STORECHAR
4732}
4733
Alexander Belopolsky40018472011-02-26 01:02:56 +00004734PyObject *
4735PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004736{
4737 if (!PyUnicode_Check(unicode)) {
4738 PyErr_BadArgument();
4739 return NULL;
4740 }
4741 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004742 PyUnicode_GET_SIZE(unicode),
4743 NULL,
4744 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004745}
4746
Guido van Rossumd57fd912000-03-10 22:53:23 +00004747/* --- UTF-16 Codec ------------------------------------------------------- */
4748
Tim Peters772747b2001-08-09 22:21:55 +00004749PyObject *
4750PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004751 Py_ssize_t size,
4752 const char *errors,
4753 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004754{
Walter Dörwald69652032004-09-07 20:24:22 +00004755 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
4756}
4757
Antoine Pitrouab868312009-01-10 15:40:25 +00004758/* Two masks for fast checking of whether a C 'long' may contain
4759 UTF16-encoded surrogate characters. This is an efficient heuristic,
4760 assuming that non-surrogate characters with a code point >= 0x8000 are
4761 rare in most input.
4762 FAST_CHAR_MASK is used when the input is in native byte ordering,
4763 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00004764*/
Antoine Pitrouab868312009-01-10 15:40:25 +00004765#if (SIZEOF_LONG == 8)
4766# define FAST_CHAR_MASK 0x8000800080008000L
4767# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
4768#elif (SIZEOF_LONG == 4)
4769# define FAST_CHAR_MASK 0x80008000L
4770# define SWAPPED_FAST_CHAR_MASK 0x00800080L
4771#else
4772# error C 'long' size should be either 4 or 8!
4773#endif
4774
Walter Dörwald69652032004-09-07 20:24:22 +00004775PyObject *
4776PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004777 Py_ssize_t size,
4778 const char *errors,
4779 int *byteorder,
4780 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004781{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004782 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004783 Py_ssize_t startinpos;
4784 Py_ssize_t endinpos;
4785 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004786 PyUnicodeObject *unicode;
4787 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004788 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00004789 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00004790 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004791 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00004792 /* Offsets from q for retrieving byte pairs in the right order. */
4793#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4794 int ihi = 1, ilo = 0;
4795#else
4796 int ihi = 0, ilo = 1;
4797#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004798 PyObject *errorHandler = NULL;
4799 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004800
4801 /* Note: size will always be longer than the resulting Unicode
4802 character count */
4803 unicode = _PyUnicode_New(size);
4804 if (!unicode)
4805 return NULL;
4806 if (size == 0)
4807 return (PyObject *)unicode;
4808
4809 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004810 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00004811 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00004812 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004813
4814 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00004815 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004816
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004817 /* Check for BOM marks (U+FEFF) in the input and adjust current
4818 byte order setting accordingly. In native mode, the leading BOM
4819 mark is skipped, in all other modes, it is copied to the output
4820 stream as-is (giving a ZWNBSP character). */
4821 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00004822 if (size >= 2) {
4823 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004824#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004825 if (bom == 0xFEFF) {
4826 q += 2;
4827 bo = -1;
4828 }
4829 else if (bom == 0xFFFE) {
4830 q += 2;
4831 bo = 1;
4832 }
Tim Petersced69f82003-09-16 20:30:58 +00004833#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004834 if (bom == 0xFEFF) {
4835 q += 2;
4836 bo = 1;
4837 }
4838 else if (bom == 0xFFFE) {
4839 q += 2;
4840 bo = -1;
4841 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004842#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004843 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004844 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004845
Tim Peters772747b2001-08-09 22:21:55 +00004846 if (bo == -1) {
4847 /* force LE */
4848 ihi = 1;
4849 ilo = 0;
4850 }
4851 else if (bo == 1) {
4852 /* force BE */
4853 ihi = 0;
4854 ilo = 1;
4855 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004856#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4857 native_ordering = ilo < ihi;
4858#else
4859 native_ordering = ilo > ihi;
4860#endif
Tim Peters772747b2001-08-09 22:21:55 +00004861
Antoine Pitrouab868312009-01-10 15:40:25 +00004862 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00004863 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004864 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00004865 /* First check for possible aligned read of a C 'long'. Unaligned
4866 reads are more expensive, better to defer to another iteration. */
4867 if (!((size_t) q & LONG_PTR_MASK)) {
4868 /* Fast path for runs of non-surrogate chars. */
4869 register const unsigned char *_q = q;
4870 Py_UNICODE *_p = p;
4871 if (native_ordering) {
4872 /* Native ordering is simple: as long as the input cannot
4873 possibly contain a surrogate char, do an unrolled copy
4874 of several 16-bit code points to the target object.
4875 The non-surrogate check is done on several input bytes
4876 at a time (as many as a C 'long' can contain). */
4877 while (_q < aligned_end) {
4878 unsigned long data = * (unsigned long *) _q;
4879 if (data & FAST_CHAR_MASK)
4880 break;
4881 _p[0] = ((unsigned short *) _q)[0];
4882 _p[1] = ((unsigned short *) _q)[1];
4883#if (SIZEOF_LONG == 8)
4884 _p[2] = ((unsigned short *) _q)[2];
4885 _p[3] = ((unsigned short *) _q)[3];
4886#endif
4887 _q += SIZEOF_LONG;
4888 _p += SIZEOF_LONG / 2;
4889 }
4890 }
4891 else {
4892 /* Byteswapped ordering is similar, but we must decompose
4893 the copy bytewise, and take care of zero'ing out the
4894 upper bytes if the target object is in 32-bit units
4895 (that is, in UCS-4 builds). */
4896 while (_q < aligned_end) {
4897 unsigned long data = * (unsigned long *) _q;
4898 if (data & SWAPPED_FAST_CHAR_MASK)
4899 break;
4900 /* Zero upper bytes in UCS-4 builds */
4901#if (Py_UNICODE_SIZE > 2)
4902 _p[0] = 0;
4903 _p[1] = 0;
4904#if (SIZEOF_LONG == 8)
4905 _p[2] = 0;
4906 _p[3] = 0;
4907#endif
4908#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004909 /* Issue #4916; UCS-4 builds on big endian machines must
4910 fill the two last bytes of each 4-byte unit. */
4911#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
4912# define OFF 2
4913#else
4914# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00004915#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004916 ((unsigned char *) _p)[OFF + 1] = _q[0];
4917 ((unsigned char *) _p)[OFF + 0] = _q[1];
4918 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
4919 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
4920#if (SIZEOF_LONG == 8)
4921 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
4922 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
4923 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
4924 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
4925#endif
4926#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00004927 _q += SIZEOF_LONG;
4928 _p += SIZEOF_LONG / 2;
4929 }
4930 }
4931 p = _p;
4932 q = _q;
4933 if (q >= e)
4934 break;
4935 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004936 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004937
Benjamin Peterson14339b62009-01-31 16:36:08 +00004938 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00004939
4940 if (ch < 0xD800 || ch > 0xDFFF) {
4941 *p++ = ch;
4942 continue;
4943 }
4944
4945 /* UTF-16 code pair: */
4946 if (q > e) {
4947 errmsg = "unexpected end of data";
4948 startinpos = (((const char *)q) - 2) - starts;
4949 endinpos = ((const char *)e) + 1 - starts;
4950 goto utf16Error;
4951 }
4952 if (0xD800 <= ch && ch <= 0xDBFF) {
4953 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
4954 q += 2;
4955 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00004956#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004957 *p++ = ch;
4958 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004959#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004960 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004961#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004962 continue;
4963 }
4964 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004965 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00004966 startinpos = (((const char *)q)-4)-starts;
4967 endinpos = startinpos+2;
4968 goto utf16Error;
4969 }
4970
Benjamin Peterson14339b62009-01-31 16:36:08 +00004971 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004972 errmsg = "illegal encoding";
4973 startinpos = (((const char *)q)-2)-starts;
4974 endinpos = startinpos+2;
4975 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004976
Benjamin Peterson29060642009-01-31 22:14:21 +00004977 utf16Error:
4978 outpos = p - PyUnicode_AS_UNICODE(unicode);
4979 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00004980 errors,
4981 &errorHandler,
4982 "utf16", errmsg,
4983 &starts,
4984 (const char **)&e,
4985 &startinpos,
4986 &endinpos,
4987 &exc,
4988 (const char **)&q,
4989 &unicode,
4990 &outpos,
4991 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00004992 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004993 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004994 /* remaining byte at the end? (size should be even) */
4995 if (e == q) {
4996 if (!consumed) {
4997 errmsg = "truncated data";
4998 startinpos = ((const char *)q) - starts;
4999 endinpos = ((const char *)e) + 1 - starts;
5000 outpos = p - PyUnicode_AS_UNICODE(unicode);
5001 if (unicode_decode_call_errorhandler(
5002 errors,
5003 &errorHandler,
5004 "utf16", errmsg,
5005 &starts,
5006 (const char **)&e,
5007 &startinpos,
5008 &endinpos,
5009 &exc,
5010 (const char **)&q,
5011 &unicode,
5012 &outpos,
5013 &p))
5014 goto onError;
5015 /* The remaining input chars are ignored if the callback
5016 chooses to skip the input */
5017 }
5018 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005019
5020 if (byteorder)
5021 *byteorder = bo;
5022
Walter Dörwald69652032004-09-07 20:24:22 +00005023 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005024 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005025
Guido van Rossumd57fd912000-03-10 22:53:23 +00005026 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02005027 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005028 goto onError;
5029
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005030 Py_XDECREF(errorHandler);
5031 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005032 if (PyUnicode_READY(unicode) == -1) {
5033 Py_DECREF(unicode);
5034 return NULL;
5035 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005036 return (PyObject *)unicode;
5037
Benjamin Peterson29060642009-01-31 22:14:21 +00005038 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005039 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005040 Py_XDECREF(errorHandler);
5041 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005042 return NULL;
5043}
5044
Antoine Pitrouab868312009-01-10 15:40:25 +00005045#undef FAST_CHAR_MASK
5046#undef SWAPPED_FAST_CHAR_MASK
5047
Tim Peters772747b2001-08-09 22:21:55 +00005048PyObject *
5049PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005050 Py_ssize_t size,
5051 const char *errors,
5052 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005053{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005054 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005055 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005056 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005057#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005058 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005059#else
5060 const int pairs = 0;
5061#endif
Tim Peters772747b2001-08-09 22:21:55 +00005062 /* Offsets from p for storing byte pairs in the right order. */
5063#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5064 int ihi = 1, ilo = 0;
5065#else
5066 int ihi = 0, ilo = 1;
5067#endif
5068
Benjamin Peterson29060642009-01-31 22:14:21 +00005069#define STORECHAR(CH) \
5070 do { \
5071 p[ihi] = ((CH) >> 8) & 0xff; \
5072 p[ilo] = (CH) & 0xff; \
5073 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005074 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005075
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005076#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005077 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00005078 if (s[i] >= 0x10000)
5079 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005080#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005081 /* 2 * (size + pairs + (byteorder == 0)) */
5082 if (size > PY_SSIZE_T_MAX ||
5083 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005084 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005085 nsize = size + pairs + (byteorder == 0);
5086 bytesize = nsize * 2;
5087 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005088 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005089 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005090 if (v == NULL)
5091 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005092
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005093 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005094 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005095 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005096 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005097 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005098
5099 if (byteorder == -1) {
5100 /* force LE */
5101 ihi = 1;
5102 ilo = 0;
5103 }
5104 else if (byteorder == 1) {
5105 /* force BE */
5106 ihi = 0;
5107 ilo = 1;
5108 }
5109
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005110 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005111 Py_UNICODE ch = *s++;
5112 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005113#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005114 if (ch >= 0x10000) {
5115 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5116 ch = 0xD800 | ((ch-0x10000) >> 10);
5117 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005118#endif
Tim Peters772747b2001-08-09 22:21:55 +00005119 STORECHAR(ch);
5120 if (ch2)
5121 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005122 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005123
5124 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005125 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005126#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005127}
5128
Alexander Belopolsky40018472011-02-26 01:02:56 +00005129PyObject *
5130PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005131{
5132 if (!PyUnicode_Check(unicode)) {
5133 PyErr_BadArgument();
5134 return NULL;
5135 }
5136 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005137 PyUnicode_GET_SIZE(unicode),
5138 NULL,
5139 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005140}
5141
5142/* --- Unicode Escape Codec ----------------------------------------------- */
5143
/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
   if all the escapes in the string make it still a valid ASCII string.
   Returns -1 if any escapes were found which cause the string to
   pop out of ASCII range. Otherwise returns the length of the
   required buffer to hold the string.

   -1 is also returned for a negative size, for any byte > 127, and for
   a backslash that is the last byte of the input.
   */
Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Reject a negative size before forming `s + size`; computing a
       pointer in front of the buffer is not valid pointer arithmetic. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                /* single-character escapes produce one character */
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* count the backslash + the other character */
                length += 2;
                break;
            }
        }
    }
    return length;
}
5198
5199/* Similar to PyUnicode_WRITE but either write into wstr field
5200 or treat string as ASCII. */
5201#define WRITE_ASCII_OR_WSTR(kind, buf, index, value) \
5202 do { \
5203 if ((kind) != PyUnicode_WCHAR_KIND) \
5204 ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
5205 else \
5206 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
5207 } while (0)
5208
5209#define WRITE_WSTR(buf, index, value) \
5210 assert(kind == PyUnicode_WCHAR_KIND), \
5211 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value)
5212
5213
Fredrik Lundh06d12682001-01-24 07:59:11 +00005214static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005215
Alexander Belopolsky40018472011-02-26 01:02:56 +00005216PyObject *
5217PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005218 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005219 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005220{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005221 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005222 Py_ssize_t startinpos;
5223 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005224 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005225 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005226 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005227 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005228 char* message;
5229 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005230 PyObject *errorHandler = NULL;
5231 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005232 Py_ssize_t ascii_length;
5233 Py_ssize_t i;
5234 int kind;
5235 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005236
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005237 ascii_length = length_of_escaped_ascii_string(s, size);
5238
5239 /* After length_of_escaped_ascii_string() there are two alternatives,
5240 either the string is pure ASCII with named escapes like \n, etc.
5241 and we determined it's exact size (common case)
5242 or it contains \x, \u, ... escape sequences. then we create a
5243 legacy wchar string and resize it at the end of this function. */
5244 if (ascii_length >= 0) {
5245 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
5246 if (!v)
5247 goto onError;
5248 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5249 kind = PyUnicode_1BYTE_KIND;
5250 data = PyUnicode_DATA(v);
5251 }
5252 else {
5253 /* Escaped strings will always be longer than the resulting
5254 Unicode string, so we start with size here and then reduce the
5255 length after conversion to the true value.
5256 (but if the error callback returns a long replacement string
5257 we'll have to allocate more space) */
5258 v = _PyUnicode_New(size);
5259 if (!v)
5260 goto onError;
5261 kind = PyUnicode_WCHAR_KIND;
5262 data = PyUnicode_AS_UNICODE(v);
5263 }
5264
Guido van Rossumd57fd912000-03-10 22:53:23 +00005265 if (size == 0)
5266 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005267 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005268 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005269
Guido van Rossumd57fd912000-03-10 22:53:23 +00005270 while (s < end) {
5271 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005272 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005273 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005274
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005275 if (kind == PyUnicode_WCHAR_KIND) {
5276 assert(i < _PyUnicode_WSTR_LENGTH(v));
5277 }
5278 else {
5279 /* The only case in which i == ascii_length is a backslash
5280 followed by a newline. */
5281 assert(i <= ascii_length);
5282 }
5283
Guido van Rossumd57fd912000-03-10 22:53:23 +00005284 /* Non-escape characters are interpreted as Unicode ordinals */
5285 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005286 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005287 continue;
5288 }
5289
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005290 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005291 /* \ - Escapes */
5292 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005293 c = *s++;
5294 if (s > end)
5295 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005296
5297 if (kind == PyUnicode_WCHAR_KIND) {
5298 assert(i < _PyUnicode_WSTR_LENGTH(v));
5299 }
5300 else {
5301 /* The only case in which i == ascii_length is a backslash
5302 followed by a newline. */
5303 assert(i < ascii_length || (i == ascii_length && c == '\n'));
5304 }
5305
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005306 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005307
Benjamin Peterson29060642009-01-31 22:14:21 +00005308 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005309 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005310 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5311 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5312 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5313 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5314 /* FF */
5315 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5316 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5317 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5318 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5319 /* VT */
5320 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5321 /* BEL, not classic C */
5322 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005323
Benjamin Peterson29060642009-01-31 22:14:21 +00005324 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005325 case '0': case '1': case '2': case '3':
5326 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005327 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005328 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005329 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005330 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005331 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005332 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005333 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005334 break;
5335
Benjamin Peterson29060642009-01-31 22:14:21 +00005336 /* hex escapes */
5337 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005338 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005339 digits = 2;
5340 message = "truncated \\xXX escape";
5341 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005342
Benjamin Peterson29060642009-01-31 22:14:21 +00005343 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005344 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005345 digits = 4;
5346 message = "truncated \\uXXXX escape";
5347 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005348
Benjamin Peterson29060642009-01-31 22:14:21 +00005349 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005350 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005351 digits = 8;
5352 message = "truncated \\UXXXXXXXX escape";
5353 hexescape:
5354 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005355 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005356 if (s+digits>end) {
5357 endinpos = size;
5358 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005359 errors, &errorHandler,
5360 "unicodeescape", "end of string in escape sequence",
5361 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005362 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005363 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005364 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005365 goto nextByte;
5366 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005367 for (j = 0; j < digits; ++j) {
5368 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005369 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005370 endinpos = (s+j+1)-starts;
5371 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005372 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005373 errors, &errorHandler,
5374 "unicodeescape", message,
5375 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005376 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005377 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005378 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005379 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005380 }
5381 chr = (chr<<4) & ~0xF;
5382 if (c >= '0' && c <= '9')
5383 chr += c - '0';
5384 else if (c >= 'a' && c <= 'f')
5385 chr += 10 + c - 'a';
5386 else
5387 chr += 10 + c - 'A';
5388 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005389 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005390 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005391 /* _decoding_error will have already written into the
5392 target buffer. */
5393 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005394 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005395 /* when we get here, chr is a 32-bit unicode character */
5396 if (chr <= 0xffff)
5397 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005398 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005399 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005400 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005401 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005402#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005403 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005404#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005405 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005406 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5407 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005408#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005409 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005410 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005411 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005412 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005413 errors, &errorHandler,
5414 "unicodeescape", "illegal Unicode character",
5415 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005416 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005417 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005418 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005419 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005420 break;
5421
Benjamin Peterson29060642009-01-31 22:14:21 +00005422 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005423 case 'N':
5424 message = "malformed \\N character escape";
5425 if (ucnhash_CAPI == NULL) {
5426 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005427 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5428 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005429 if (ucnhash_CAPI == NULL)
5430 goto ucnhashError;
5431 }
5432 if (*s == '{') {
5433 const char *start = s+1;
5434 /* look for the closing brace */
5435 while (*s != '}' && s < end)
5436 s++;
5437 if (s > start && s < end && *s == '}') {
5438 /* found a name. look it up in the unicode database */
5439 message = "unknown Unicode character name";
5440 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005441 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5442 &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005443 goto store;
5444 }
5445 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005446 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005447 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005448 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005449 errors, &errorHandler,
5450 "unicodeescape", message,
5451 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005452 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005453 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005454 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005455 break;
5456
5457 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005458 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005459 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005460 message = "\\ at end of string";
5461 s--;
5462 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005463 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005464 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005465 errors, &errorHandler,
5466 "unicodeescape", message,
5467 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005468 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005469 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005470 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005471 }
5472 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005473 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5474 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005475 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005476 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005477 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005478 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005479 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005480 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005481 /* Ensure the length prediction worked in case of ASCII strings */
5482 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5483
Victor Stinnerfe226c02011-10-03 03:52:20 +02005484 if (kind == PyUnicode_WCHAR_KIND)
5485 {
5486 if (PyUnicode_Resize((PyObject**)&v, i) < 0)
5487 goto onError;
5488 if (PyUnicode_READY(v) == -1)
5489 goto onError;
5490 }
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005491 Py_XDECREF(errorHandler);
5492 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005493 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005494
Benjamin Peterson29060642009-01-31 22:14:21 +00005495 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005496 PyErr_SetString(
5497 PyExc_UnicodeError,
5498 "\\N escapes not supported (can't load unicodedata module)"
5499 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005500 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005501 Py_XDECREF(errorHandler);
5502 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005503 return NULL;
5504
Benjamin Peterson29060642009-01-31 22:14:21 +00005505 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005506 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005507 Py_XDECREF(errorHandler);
5508 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005509 return NULL;
5510}
5511
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005512#undef WRITE_ASCII_OR_WSTR
5513#undef WRITE_WSTR
5514
Guido van Rossumd57fd912000-03-10 22:53:23 +00005515/* Return a Unicode-Escape string version of the Unicode object.
5516
5517 If quotes is true, the string is enclosed in u"" or u'' quotes as
5518 appropriate.
5519
5520*/
5521
Walter Dörwald79e913e2007-05-12 11:08:06 +00005522static const char *hexdigits = "0123456789abcdef";
5523
Alexander Belopolsky40018472011-02-26 01:02:56 +00005524PyObject *
5525PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005526 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005527{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005528 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005529 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005530
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005531#ifdef Py_UNICODE_WIDE
5532 const Py_ssize_t expandsize = 10;
5533#else
5534 const Py_ssize_t expandsize = 6;
5535#endif
5536
Thomas Wouters89f507f2006-12-13 04:49:30 +00005537 /* XXX(nnorwitz): rather than over-allocating, it would be
5538 better to choose a different scheme. Perhaps scan the
5539 first N-chars of the string and allocate based on that size.
5540 */
5541 /* Initial allocation is based on the longest-possible unichr
5542 escape.
5543
5544 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5545 unichr, so in this case it's the longest unichr escape. In
5546 narrow (UTF-16) builds this is five chars per source unichr
5547 since there are two unichrs in the surrogate pair, so in narrow
5548 (UTF-16) builds it's not the longest unichr escape.
5549
5550 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5551 so in the narrow (UTF-16) build case it's the longest unichr
5552 escape.
5553 */
5554
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005555 if (size == 0)
5556 return PyBytes_FromStringAndSize(NULL, 0);
5557
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005558 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005559 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005560
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005561 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005562 2
5563 + expandsize*size
5564 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005565 if (repr == NULL)
5566 return NULL;
5567
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005568 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005569
Guido van Rossumd57fd912000-03-10 22:53:23 +00005570 while (size-- > 0) {
5571 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005572
Walter Dörwald79e913e2007-05-12 11:08:06 +00005573 /* Escape backslashes */
5574 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005575 *p++ = '\\';
5576 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005577 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005578 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005579
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00005580#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005581 /* Map 21-bit characters to '\U00xxxxxx' */
5582 else if (ch >= 0x10000) {
5583 *p++ = '\\';
5584 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005585 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5586 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5587 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5588 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5589 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5590 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5591 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5592 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005593 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005594 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005595#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005596 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5597 else if (ch >= 0xD800 && ch < 0xDC00) {
5598 Py_UNICODE ch2;
5599 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00005600
Benjamin Peterson29060642009-01-31 22:14:21 +00005601 ch2 = *s++;
5602 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005603 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005604 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5605 *p++ = '\\';
5606 *p++ = 'U';
5607 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
5608 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
5609 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
5610 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
5611 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
5612 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
5613 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
5614 *p++ = hexdigits[ucs & 0x0000000F];
5615 continue;
5616 }
5617 /* Fall through: isolated surrogates are copied as-is */
5618 s--;
5619 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005620 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005621#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005622
Guido van Rossumd57fd912000-03-10 22:53:23 +00005623 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005624 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005625 *p++ = '\\';
5626 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005627 *p++ = hexdigits[(ch >> 12) & 0x000F];
5628 *p++ = hexdigits[(ch >> 8) & 0x000F];
5629 *p++ = hexdigits[(ch >> 4) & 0x000F];
5630 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005631 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005632
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005633 /* Map special whitespace to '\t', \n', '\r' */
5634 else if (ch == '\t') {
5635 *p++ = '\\';
5636 *p++ = 't';
5637 }
5638 else if (ch == '\n') {
5639 *p++ = '\\';
5640 *p++ = 'n';
5641 }
5642 else if (ch == '\r') {
5643 *p++ = '\\';
5644 *p++ = 'r';
5645 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005646
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005647 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005648 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005649 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005650 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005651 *p++ = hexdigits[(ch >> 4) & 0x000F];
5652 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005653 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005654
Guido van Rossumd57fd912000-03-10 22:53:23 +00005655 /* Copy everything else as-is */
5656 else
5657 *p++ = (char) ch;
5658 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005659
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005660 assert(p - PyBytes_AS_STRING(repr) > 0);
5661 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5662 return NULL;
5663 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005664}
5665
Alexander Belopolsky40018472011-02-26 01:02:56 +00005666PyObject *
5667PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005668{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005669 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005670 if (!PyUnicode_Check(unicode)) {
5671 PyErr_BadArgument();
5672 return NULL;
5673 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00005674 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5675 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005676 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005677}
5678
5679/* --- Raw Unicode Escape Codec ------------------------------------------- */
5680
Alexander Belopolsky40018472011-02-26 01:02:56 +00005681PyObject *
5682PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005683 Py_ssize_t size,
5684 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005685{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005686 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005687 Py_ssize_t startinpos;
5688 Py_ssize_t endinpos;
5689 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005690 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005691 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005692 const char *end;
5693 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005694 PyObject *errorHandler = NULL;
5695 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005696
Guido van Rossumd57fd912000-03-10 22:53:23 +00005697 /* Escaped strings will always be longer than the resulting
5698 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005699 length after conversion to the true value. (But decoding error
5700 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005701 v = _PyUnicode_New(size);
5702 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005703 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005704 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005705 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005706 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005707 end = s + size;
5708 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005709 unsigned char c;
5710 Py_UCS4 x;
5711 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005712 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005713
Benjamin Peterson29060642009-01-31 22:14:21 +00005714 /* Non-escape characters are interpreted as Unicode ordinals */
5715 if (*s != '\\') {
5716 *p++ = (unsigned char)*s++;
5717 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005718 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005719 startinpos = s-starts;
5720
5721 /* \u-escapes are only interpreted iff the number of leading
5722 backslashes if odd */
5723 bs = s;
5724 for (;s < end;) {
5725 if (*s != '\\')
5726 break;
5727 *p++ = (unsigned char)*s++;
5728 }
5729 if (((s - bs) & 1) == 0 ||
5730 s >= end ||
5731 (*s != 'u' && *s != 'U')) {
5732 continue;
5733 }
5734 p--;
5735 count = *s=='u' ? 4 : 8;
5736 s++;
5737
5738 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
5739 outpos = p-PyUnicode_AS_UNICODE(v);
5740 for (x = 0, i = 0; i < count; ++i, ++s) {
5741 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005742 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005743 endinpos = s-starts;
5744 if (unicode_decode_call_errorhandler(
5745 errors, &errorHandler,
5746 "rawunicodeescape", "truncated \\uXXXX",
5747 &starts, &end, &startinpos, &endinpos, &exc, &s,
5748 &v, &outpos, &p))
5749 goto onError;
5750 goto nextByte;
5751 }
5752 x = (x<<4) & ~0xF;
5753 if (c >= '0' && c <= '9')
5754 x += c - '0';
5755 else if (c >= 'a' && c <= 'f')
5756 x += 10 + c - 'a';
5757 else
5758 x += 10 + c - 'A';
5759 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00005760 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00005761 /* UCS-2 character */
5762 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005763 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005764 /* UCS-4 character. Either store directly, or as
5765 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00005766#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005767 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005768#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005769 x -= 0x10000L;
5770 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
5771 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00005772#endif
5773 } else {
5774 endinpos = s-starts;
5775 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005776 if (unicode_decode_call_errorhandler(
5777 errors, &errorHandler,
5778 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005779 &starts, &end, &startinpos, &endinpos, &exc, &s,
5780 &v, &outpos, &p))
5781 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005782 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005783 nextByte:
5784 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005785 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02005786 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005787 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005788 Py_XDECREF(errorHandler);
5789 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005790 if (PyUnicode_READY(v) == -1) {
5791 Py_DECREF(v);
5792 return NULL;
5793 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005794 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005795
Benjamin Peterson29060642009-01-31 22:14:21 +00005796 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005797 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005798 Py_XDECREF(errorHandler);
5799 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005800 return NULL;
5801}
5802
Alexander Belopolsky40018472011-02-26 01:02:56 +00005803PyObject *
5804PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005805 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005806{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005807 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005808 char *p;
5809 char *q;
5810
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005811#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005812 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005813#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005814 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005815#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00005816
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005817 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005818 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005819
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005820 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005821 if (repr == NULL)
5822 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005823 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005824 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005825
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005826 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005827 while (size-- > 0) {
5828 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005829#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005830 /* Map 32-bit characters to '\Uxxxxxxxx' */
5831 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005832 *p++ = '\\';
5833 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005834 *p++ = hexdigits[(ch >> 28) & 0xf];
5835 *p++ = hexdigits[(ch >> 24) & 0xf];
5836 *p++ = hexdigits[(ch >> 20) & 0xf];
5837 *p++ = hexdigits[(ch >> 16) & 0xf];
5838 *p++ = hexdigits[(ch >> 12) & 0xf];
5839 *p++ = hexdigits[(ch >> 8) & 0xf];
5840 *p++ = hexdigits[(ch >> 4) & 0xf];
5841 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00005842 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005843 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00005844#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005845 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5846 if (ch >= 0xD800 && ch < 0xDC00) {
5847 Py_UNICODE ch2;
5848 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005849
Benjamin Peterson29060642009-01-31 22:14:21 +00005850 ch2 = *s++;
5851 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005852 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005853 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5854 *p++ = '\\';
5855 *p++ = 'U';
5856 *p++ = hexdigits[(ucs >> 28) & 0xf];
5857 *p++ = hexdigits[(ucs >> 24) & 0xf];
5858 *p++ = hexdigits[(ucs >> 20) & 0xf];
5859 *p++ = hexdigits[(ucs >> 16) & 0xf];
5860 *p++ = hexdigits[(ucs >> 12) & 0xf];
5861 *p++ = hexdigits[(ucs >> 8) & 0xf];
5862 *p++ = hexdigits[(ucs >> 4) & 0xf];
5863 *p++ = hexdigits[ucs & 0xf];
5864 continue;
5865 }
5866 /* Fall through: isolated surrogates are copied as-is */
5867 s--;
5868 size++;
5869 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005870#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005871 /* Map 16-bit characters to '\uxxxx' */
5872 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005873 *p++ = '\\';
5874 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005875 *p++ = hexdigits[(ch >> 12) & 0xf];
5876 *p++ = hexdigits[(ch >> 8) & 0xf];
5877 *p++ = hexdigits[(ch >> 4) & 0xf];
5878 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005879 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005880 /* Copy everything else as-is */
5881 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00005882 *p++ = (char) ch;
5883 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005884 size = p - q;
5885
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005886 assert(size > 0);
5887 if (_PyBytes_Resize(&repr, size) < 0)
5888 return NULL;
5889 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005890}
5891
Alexander Belopolsky40018472011-02-26 01:02:56 +00005892PyObject *
5893PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005894{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005895 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005896 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00005897 PyErr_BadArgument();
5898 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005899 }
Walter Dörwald711005d2007-05-12 12:03:26 +00005900 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5901 PyUnicode_GET_SIZE(unicode));
5902
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005903 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005904}
5905
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005906/* --- Unicode Internal Codec ------------------------------------------- */
5907
Alexander Belopolsky40018472011-02-26 01:02:56 +00005908PyObject *
5909_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005910 Py_ssize_t size,
5911 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005912{
5913 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005914 Py_ssize_t startinpos;
5915 Py_ssize_t endinpos;
5916 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005917 PyUnicodeObject *v;
5918 Py_UNICODE *p;
5919 const char *end;
5920 const char *reason;
5921 PyObject *errorHandler = NULL;
5922 PyObject *exc = NULL;
5923
Neal Norwitzd43069c2006-01-08 01:12:10 +00005924#ifdef Py_UNICODE_WIDE
5925 Py_UNICODE unimax = PyUnicode_GetMax();
5926#endif
5927
Thomas Wouters89f507f2006-12-13 04:49:30 +00005928 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005929 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
5930 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005931 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005932 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
5933 as string was created with the old API. */
5934 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005935 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005936 p = PyUnicode_AS_UNICODE(v);
5937 end = s + size;
5938
5939 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005940 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005941 /* We have to sanity check the raw data, otherwise doom looms for
5942 some malformed UCS-4 data. */
5943 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00005944#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005945 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00005946#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005947 end-s < Py_UNICODE_SIZE
5948 )
Benjamin Peterson29060642009-01-31 22:14:21 +00005949 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005950 startinpos = s - starts;
5951 if (end-s < Py_UNICODE_SIZE) {
5952 endinpos = end-starts;
5953 reason = "truncated input";
5954 }
5955 else {
5956 endinpos = s - starts + Py_UNICODE_SIZE;
5957 reason = "illegal code point (> 0x10FFFF)";
5958 }
5959 outpos = p - PyUnicode_AS_UNICODE(v);
5960 if (unicode_decode_call_errorhandler(
5961 errors, &errorHandler,
5962 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00005963 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00005964 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005965 goto onError;
5966 }
5967 }
5968 else {
5969 p++;
5970 s += Py_UNICODE_SIZE;
5971 }
5972 }
5973
Victor Stinnerfe226c02011-10-03 03:52:20 +02005974 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005975 goto onError;
5976 Py_XDECREF(errorHandler);
5977 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005978 if (PyUnicode_READY(v) == -1) {
5979 Py_DECREF(v);
5980 return NULL;
5981 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005982 return (PyObject *)v;
5983
Benjamin Peterson29060642009-01-31 22:14:21 +00005984 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005985 Py_XDECREF(v);
5986 Py_XDECREF(errorHandler);
5987 Py_XDECREF(exc);
5988 return NULL;
5989}
5990
Guido van Rossumd57fd912000-03-10 22:53:23 +00005991/* --- Latin-1 Codec ------------------------------------------------------ */
5992
Alexander Belopolsky40018472011-02-26 01:02:56 +00005993PyObject *
5994PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005995 Py_ssize_t size,
5996 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005997{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005998 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02005999 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006000}
6001
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006002/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006003static void
6004make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006005 const char *encoding,
6006 const Py_UNICODE *unicode, Py_ssize_t size,
6007 Py_ssize_t startpos, Py_ssize_t endpos,
6008 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006009{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006010 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006011 *exceptionObject = PyUnicodeEncodeError_Create(
6012 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006013 }
6014 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006015 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6016 goto onError;
6017 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6018 goto onError;
6019 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6020 goto onError;
6021 return;
6022 onError:
6023 Py_DECREF(*exceptionObject);
6024 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006025 }
6026}
6027
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006028/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006029static void
6030raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006031 const char *encoding,
6032 const Py_UNICODE *unicode, Py_ssize_t size,
6033 Py_ssize_t startpos, Py_ssize_t endpos,
6034 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006035{
6036 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006037 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006038 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006039 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006040}
6041
6042/* error handling callback helper:
6043 build arguments, call the callback and check the arguments,
6044 put the result into newpos and return the replacement string, which
6045 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006046static PyObject *
6047unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006048 PyObject **errorHandler,
6049 const char *encoding, const char *reason,
6050 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
6051 Py_ssize_t startpos, Py_ssize_t endpos,
6052 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006053{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006054 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006055
6056 PyObject *restuple;
6057 PyObject *resunicode;
6058
6059 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006060 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006061 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006062 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006063 }
6064
6065 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006066 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006067 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006068 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006069
6070 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006071 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006072 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006073 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006074 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006075 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006076 Py_DECREF(restuple);
6077 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006078 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006079 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006080 &resunicode, newpos)) {
6081 Py_DECREF(restuple);
6082 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006083 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006084 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6085 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6086 Py_DECREF(restuple);
6087 return NULL;
6088 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006089 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006090 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006091 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006092 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6093 Py_DECREF(restuple);
6094 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006095 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006096 Py_INCREF(resunicode);
6097 Py_DECREF(restuple);
6098 return resunicode;
6099}
6100
Alexander Belopolsky40018472011-02-26 01:02:56 +00006101static PyObject *
6102unicode_encode_ucs1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006103 Py_ssize_t size,
6104 const char *errors,
6105 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006106{
6107 /* output object */
6108 PyObject *res;
6109 /* pointers to the beginning and end+1 of input */
6110 const Py_UNICODE *startp = p;
6111 const Py_UNICODE *endp = p + size;
6112 /* pointer to the beginning of the unencodable characters */
6113 /* const Py_UNICODE *badp = NULL; */
6114 /* pointer into the output */
6115 char *str;
6116 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006117 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006118 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6119 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006120 PyObject *errorHandler = NULL;
6121 PyObject *exc = NULL;
6122 /* the following variable is used for caching string comparisons
6123 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6124 int known_errorHandler = -1;
6125
6126 /* allocate enough for a simple encoding without
6127 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006128 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006129 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006130 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006131 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006132 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006133 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006134 ressize = size;
6135
6136 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006137 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006138
Benjamin Peterson29060642009-01-31 22:14:21 +00006139 /* can we encode this? */
6140 if (c<limit) {
6141 /* no overflow check, because we know that the space is enough */
6142 *str++ = (char)c;
6143 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006144 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006145 else {
6146 Py_ssize_t unicodepos = p-startp;
6147 Py_ssize_t requiredsize;
6148 PyObject *repunicode;
6149 Py_ssize_t repsize;
6150 Py_ssize_t newpos;
6151 Py_ssize_t respos;
6152 Py_UNICODE *uni2;
6153 /* startpos for collecting unencodable chars */
6154 const Py_UNICODE *collstart = p;
6155 const Py_UNICODE *collend = p;
6156 /* find all unecodable characters */
6157 while ((collend < endp) && ((*collend)>=limit))
6158 ++collend;
6159 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6160 if (known_errorHandler==-1) {
6161 if ((errors==NULL) || (!strcmp(errors, "strict")))
6162 known_errorHandler = 1;
6163 else if (!strcmp(errors, "replace"))
6164 known_errorHandler = 2;
6165 else if (!strcmp(errors, "ignore"))
6166 known_errorHandler = 3;
6167 else if (!strcmp(errors, "xmlcharrefreplace"))
6168 known_errorHandler = 4;
6169 else
6170 known_errorHandler = 0;
6171 }
6172 switch (known_errorHandler) {
6173 case 1: /* strict */
6174 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
6175 goto onError;
6176 case 2: /* replace */
6177 while (collstart++<collend)
6178 *str++ = '?'; /* fall through */
6179 case 3: /* ignore */
6180 p = collend;
6181 break;
6182 case 4: /* xmlcharrefreplace */
6183 respos = str - PyBytes_AS_STRING(res);
6184 /* determine replacement size (temporarily (mis)uses p) */
6185 for (p = collstart, repsize = 0; p < collend; ++p) {
6186 if (*p<10)
6187 repsize += 2+1+1;
6188 else if (*p<100)
6189 repsize += 2+2+1;
6190 else if (*p<1000)
6191 repsize += 2+3+1;
6192 else if (*p<10000)
6193 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006194#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006195 else
6196 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006197#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006198 else if (*p<100000)
6199 repsize += 2+5+1;
6200 else if (*p<1000000)
6201 repsize += 2+6+1;
6202 else
6203 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00006204#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006205 }
6206 requiredsize = respos+repsize+(endp-collend);
6207 if (requiredsize > ressize) {
6208 if (requiredsize<2*ressize)
6209 requiredsize = 2*ressize;
6210 if (_PyBytes_Resize(&res, requiredsize))
6211 goto onError;
6212 str = PyBytes_AS_STRING(res) + respos;
6213 ressize = requiredsize;
6214 }
6215 /* generate replacement (temporarily (mis)uses p) */
6216 for (p = collstart; p < collend; ++p) {
6217 str += sprintf(str, "&#%d;", (int)*p);
6218 }
6219 p = collend;
6220 break;
6221 default:
6222 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6223 encoding, reason, startp, size, &exc,
6224 collstart-startp, collend-startp, &newpos);
6225 if (repunicode == NULL)
6226 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006227 if (PyBytes_Check(repunicode)) {
6228 /* Directly copy bytes result to output. */
6229 repsize = PyBytes_Size(repunicode);
6230 if (repsize > 1) {
6231 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006232 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006233 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6234 Py_DECREF(repunicode);
6235 goto onError;
6236 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006237 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006238 ressize += repsize-1;
6239 }
6240 memcpy(str, PyBytes_AsString(repunicode), repsize);
6241 str += repsize;
6242 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006243 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006244 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006245 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006246 /* need more space? (at least enough for what we
6247 have+the replacement+the rest of the string, so
6248 we won't have to check space for encodable characters) */
6249 respos = str - PyBytes_AS_STRING(res);
6250 repsize = PyUnicode_GET_SIZE(repunicode);
6251 requiredsize = respos+repsize+(endp-collend);
6252 if (requiredsize > ressize) {
6253 if (requiredsize<2*ressize)
6254 requiredsize = 2*ressize;
6255 if (_PyBytes_Resize(&res, requiredsize)) {
6256 Py_DECREF(repunicode);
6257 goto onError;
6258 }
6259 str = PyBytes_AS_STRING(res) + respos;
6260 ressize = requiredsize;
6261 }
6262 /* check if there is anything unencodable in the replacement
6263 and copy it to the output */
6264 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
6265 c = *uni2;
6266 if (c >= limit) {
6267 raise_encode_exception(&exc, encoding, startp, size,
6268 unicodepos, unicodepos+1, reason);
6269 Py_DECREF(repunicode);
6270 goto onError;
6271 }
6272 *str = (char)c;
6273 }
6274 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006275 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006276 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006277 }
6278 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006279 /* Resize if we allocated to much */
6280 size = str - PyBytes_AS_STRING(res);
6281 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006282 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006283 if (_PyBytes_Resize(&res, size) < 0)
6284 goto onError;
6285 }
6286
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006287 Py_XDECREF(errorHandler);
6288 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006289 return res;
6290
6291 onError:
6292 Py_XDECREF(res);
6293 Py_XDECREF(errorHandler);
6294 Py_XDECREF(exc);
6295 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006296}
6297
Alexander Belopolsky40018472011-02-26 01:02:56 +00006298PyObject *
6299PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006300 Py_ssize_t size,
6301 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006302{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006303 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006304}
6305
Alexander Belopolsky40018472011-02-26 01:02:56 +00006306PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006307_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006308{
6309 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006310 PyErr_BadArgument();
6311 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006312 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006313 if (PyUnicode_READY(unicode) == -1)
6314 return NULL;
6315 /* Fast path: if it is a one-byte string, construct
6316 bytes object directly. */
6317 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6318 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6319 PyUnicode_GET_LENGTH(unicode));
6320 /* Non-Latin-1 characters present. Defer to above function to
6321 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006322 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006323 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006324 errors);
6325}
6326
6327PyObject*
6328PyUnicode_AsLatin1String(PyObject *unicode)
6329{
6330 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006331}
6332
6333/* --- 7-bit ASCII Codec -------------------------------------------------- */
6334
Alexander Belopolsky40018472011-02-26 01:02:56 +00006335PyObject *
6336PyUnicode_DecodeASCII(const char *s,
6337 Py_ssize_t size,
6338 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006339{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006340 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006341 PyUnicodeObject *v;
6342 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006343 Py_ssize_t startinpos;
6344 Py_ssize_t endinpos;
6345 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006346 const char *e;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006347 unsigned char* d;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006348 PyObject *errorHandler = NULL;
6349 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006350 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00006351
Guido van Rossumd57fd912000-03-10 22:53:23 +00006352 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006353 if (size == 1 && *(unsigned char*)s < 128)
6354 return PyUnicode_FromOrdinal(*(unsigned char*)s);
6355
6356 /* Fast path. Assume the input actually *is* ASCII, and allocate
6357 a single-block Unicode object with that assumption. If there is
6358 an error, drop the object and start over. */
6359 v = (PyUnicodeObject*)PyUnicode_New(size, 127);
6360 if (v == NULL)
6361 goto onError;
6362 d = PyUnicode_1BYTE_DATA(v);
6363 for (i = 0; i < size; i++) {
6364 unsigned char ch = ((unsigned char*)s)[i];
6365 if (ch < 128)
6366 d[i] = ch;
6367 else
6368 break;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006369 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006370 if (i == size)
6371 return (PyObject*)v;
6372 Py_DECREF(v); /* start over */
Tim Petersced69f82003-09-16 20:30:58 +00006373
Guido van Rossumd57fd912000-03-10 22:53:23 +00006374 v = _PyUnicode_New(size);
6375 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006376 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006377 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006378 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006379 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006380 e = s + size;
6381 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006382 register unsigned char c = (unsigned char)*s;
6383 if (c < 128) {
6384 *p++ = c;
6385 ++s;
6386 }
6387 else {
6388 startinpos = s-starts;
6389 endinpos = startinpos + 1;
6390 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
6391 if (unicode_decode_call_errorhandler(
6392 errors, &errorHandler,
6393 "ascii", "ordinal not in range(128)",
6394 &starts, &e, &startinpos, &endinpos, &exc, &s,
6395 &v, &outpos, &p))
6396 goto onError;
6397 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006398 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00006399 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinnerfe226c02011-10-03 03:52:20 +02006400 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006401 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006402 Py_XDECREF(errorHandler);
6403 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006404 if (PyUnicode_READY(v) == -1) {
6405 Py_DECREF(v);
6406 return NULL;
6407 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006408 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006409
Benjamin Peterson29060642009-01-31 22:14:21 +00006410 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006411 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006412 Py_XDECREF(errorHandler);
6413 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006414 return NULL;
6415}
6416
Alexander Belopolsky40018472011-02-26 01:02:56 +00006417PyObject *
6418PyUnicode_EncodeASCII(const Py_UNICODE *p,
6419 Py_ssize_t size,
6420 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006421{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006422 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006423}
6424
Alexander Belopolsky40018472011-02-26 01:02:56 +00006425PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006426_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006427{
6428 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006429 PyErr_BadArgument();
6430 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006431 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006432 if (PyUnicode_READY(unicode) == -1)
6433 return NULL;
6434 /* Fast path: if it is an ASCII-only string, construct bytes object
6435 directly. Else defer to above function to raise the exception. */
6436 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6437 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6438 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006439 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006440 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006441 errors);
6442}
6443
6444PyObject *
6445PyUnicode_AsASCIIString(PyObject *unicode)
6446{
6447 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006448}
6449
Victor Stinner99b95382011-07-04 14:23:54 +02006450#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006451
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006452/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006453
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006454#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006455#define NEED_RETRY
6456#endif
6457
6458/* XXX This code is limited to "true" double-byte encodings, as
6459 a) it assumes an incomplete character consists of a single byte, and
6460 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00006461 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006462
/* Return 1 if the byte at s[offset] is to be treated as a DBCS lead byte,
   0 otherwise.

   The byte must satisfy IsDBCSLeadByte.  It then counts as a lead byte
   when CharPrev() cannot step back from it, when the character before it
   does not start with a lead byte, or when that previous character is
   exactly two bytes long. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
6474
6475/*
6476 * Decode MBCS string into unicode object. If 'final' is set, converts
6477 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6478 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006479static int
6480decode_mbcs(PyUnicodeObject **v,
6481 const char *s, /* MBCS string */
6482 int size, /* sizeof MBCS string */
6483 int final,
6484 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006485{
6486 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00006487 Py_ssize_t n;
6488 DWORD usize;
6489 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006490
6491 assert(size >= 0);
6492
Victor Stinner554f3f02010-06-16 23:33:54 +00006493 /* check and handle 'errors' arg */
6494 if (errors==NULL || strcmp(errors, "strict")==0)
6495 flags = MB_ERR_INVALID_CHARS;
6496 else if (strcmp(errors, "ignore")==0)
6497 flags = 0;
6498 else {
6499 PyErr_Format(PyExc_ValueError,
6500 "mbcs encoding does not support errors='%s'",
6501 errors);
6502 return -1;
6503 }
6504
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006505 /* Skip trailing lead-byte unless 'final' is set */
6506 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006507 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006508
6509 /* First get the size of the result */
6510 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006511 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6512 if (usize==0)
6513 goto mbcs_decode_error;
6514 } else
6515 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006516
6517 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006518 /* Create unicode object */
6519 *v = _PyUnicode_New(usize);
6520 if (*v == NULL)
6521 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006522 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006523 }
6524 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006525 /* Extend unicode object */
6526 n = PyUnicode_GET_SIZE(*v);
Victor Stinner2fd82272011-10-03 04:06:05 +02006527 if (PyUnicode_Resize((PyObject**)v, n + usize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006528 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006529 }
6530
6531 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00006532 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006533 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006534 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6535 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00006536 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006537 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006538 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00006539
6540mbcs_decode_error:
6541 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6542 we raise a UnicodeDecodeError - else it is a 'generic'
6543 windows error
6544 */
6545 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6546 /* Ideally, we should get reason from FormatMessage - this
6547 is the Windows 2000 English version of the message
6548 */
6549 PyObject *exc = NULL;
6550 const char *reason = "No mapping for the Unicode character exists "
6551 "in the target multi-byte code page.";
6552 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6553 if (exc != NULL) {
6554 PyCodec_StrictErrors(exc);
6555 Py_DECREF(exc);
6556 }
6557 } else {
6558 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6559 }
6560 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006561}
6562
Alexander Belopolsky40018472011-02-26 01:02:56 +00006563PyObject *
6564PyUnicode_DecodeMBCSStateful(const char *s,
6565 Py_ssize_t size,
6566 const char *errors,
6567 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006568{
6569 PyUnicodeObject *v = NULL;
6570 int done;
6571
6572 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006573 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006574
6575#ifdef NEED_RETRY
6576 retry:
6577 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006578 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006579 else
6580#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006581 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006582
6583 if (done < 0) {
6584 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006585 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006586 }
6587
6588 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006589 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006590
6591#ifdef NEED_RETRY
6592 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006593 s += done;
6594 size -= done;
6595 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006596 }
6597#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006598 if (PyUnicode_READY(v) == -1) {
6599 Py_DECREF(v);
6600 return NULL;
6601 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006602 return (PyObject *)v;
6603}
6604
Alexander Belopolsky40018472011-02-26 01:02:56 +00006605PyObject *
6606PyUnicode_DecodeMBCS(const char *s,
6607 Py_ssize_t size,
6608 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006609{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006610 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6611}
6612
6613/*
6614 * Convert unicode into string object (MBCS).
6615 * Returns 0 if succeed, -1 otherwise.
6616 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006617static int
6618encode_mbcs(PyObject **repr,
6619 const Py_UNICODE *p, /* unicode */
6620 int size, /* size of unicode */
6621 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006622{
Victor Stinner554f3f02010-06-16 23:33:54 +00006623 BOOL usedDefaultChar = FALSE;
6624 BOOL *pusedDefaultChar;
6625 int mbcssize;
6626 Py_ssize_t n;
6627 PyObject *exc = NULL;
6628 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006629
6630 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006631
Victor Stinner554f3f02010-06-16 23:33:54 +00006632 /* check and handle 'errors' arg */
6633 if (errors==NULL || strcmp(errors, "strict")==0) {
6634 flags = WC_NO_BEST_FIT_CHARS;
6635 pusedDefaultChar = &usedDefaultChar;
6636 } else if (strcmp(errors, "replace")==0) {
6637 flags = 0;
6638 pusedDefaultChar = NULL;
6639 } else {
6640 PyErr_Format(PyExc_ValueError,
6641 "mbcs encoding does not support errors='%s'",
6642 errors);
6643 return -1;
6644 }
6645
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006646 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006647 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006648 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
6649 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00006650 if (mbcssize == 0) {
6651 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6652 return -1;
6653 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006654 /* If we used a default char, then we failed! */
6655 if (pusedDefaultChar && *pusedDefaultChar)
6656 goto mbcs_encode_error;
6657 } else {
6658 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006659 }
6660
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006661 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006662 /* Create string object */
6663 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
6664 if (*repr == NULL)
6665 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006666 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006667 }
6668 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006669 /* Extend string object */
6670 n = PyBytes_Size(*repr);
6671 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
6672 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006673 }
6674
6675 /* Do the conversion */
6676 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006677 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006678 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
6679 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006680 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6681 return -1;
6682 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006683 if (pusedDefaultChar && *pusedDefaultChar)
6684 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006685 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006686 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006687
6688mbcs_encode_error:
6689 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
6690 Py_XDECREF(exc);
6691 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006692}
6693
Alexander Belopolsky40018472011-02-26 01:02:56 +00006694PyObject *
6695PyUnicode_EncodeMBCS(const Py_UNICODE *p,
6696 Py_ssize_t size,
6697 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006698{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006699 PyObject *repr = NULL;
6700 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00006701
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006702#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00006703 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006704 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006705 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006706 else
6707#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006708 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006709
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006710 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006711 Py_XDECREF(repr);
6712 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006713 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006714
6715#ifdef NEED_RETRY
6716 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006717 p += INT_MAX;
6718 size -= INT_MAX;
6719 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006720 }
6721#endif
6722
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006723 return repr;
6724}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006725
Alexander Belopolsky40018472011-02-26 01:02:56 +00006726PyObject *
6727PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006728{
6729 if (!PyUnicode_Check(unicode)) {
6730 PyErr_BadArgument();
6731 return NULL;
6732 }
6733 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006734 PyUnicode_GET_SIZE(unicode),
6735 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006736}
6737
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006738#undef NEED_RETRY
6739
Victor Stinner99b95382011-07-04 14:23:54 +02006740#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006741
Guido van Rossumd57fd912000-03-10 22:53:23 +00006742/* --- Character Mapping Codec -------------------------------------------- */
6743
Alexander Belopolsky40018472011-02-26 01:02:56 +00006744PyObject *
6745PyUnicode_DecodeCharmap(const char *s,
6746 Py_ssize_t size,
6747 PyObject *mapping,
6748 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006749{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006750 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006751 Py_ssize_t startinpos;
6752 Py_ssize_t endinpos;
6753 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006754 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006755 PyUnicodeObject *v;
6756 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006757 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006758 PyObject *errorHandler = NULL;
6759 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006760 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006761 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006762
Guido van Rossumd57fd912000-03-10 22:53:23 +00006763 /* Default to Latin-1 */
6764 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006765 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006766
6767 v = _PyUnicode_New(size);
6768 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006769 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006770 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006771 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006772 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006773 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006774 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006775 mapstring = PyUnicode_AS_UNICODE(mapping);
6776 maplen = PyUnicode_GET_SIZE(mapping);
6777 while (s < e) {
6778 unsigned char ch = *s;
6779 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006780
Benjamin Peterson29060642009-01-31 22:14:21 +00006781 if (ch < maplen)
6782 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006783
Benjamin Peterson29060642009-01-31 22:14:21 +00006784 if (x == 0xfffe) {
6785 /* undefined mapping */
6786 outpos = p-PyUnicode_AS_UNICODE(v);
6787 startinpos = s-starts;
6788 endinpos = startinpos+1;
6789 if (unicode_decode_call_errorhandler(
6790 errors, &errorHandler,
6791 "charmap", "character maps to <undefined>",
6792 &starts, &e, &startinpos, &endinpos, &exc, &s,
6793 &v, &outpos, &p)) {
6794 goto onError;
6795 }
6796 continue;
6797 }
6798 *p++ = x;
6799 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006800 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006801 }
6802 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006803 while (s < e) {
6804 unsigned char ch = *s;
6805 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006806
Benjamin Peterson29060642009-01-31 22:14:21 +00006807 /* Get mapping (char ordinal -> integer, Unicode char or None) */
6808 w = PyLong_FromLong((long)ch);
6809 if (w == NULL)
6810 goto onError;
6811 x = PyObject_GetItem(mapping, w);
6812 Py_DECREF(w);
6813 if (x == NULL) {
6814 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6815 /* No mapping found means: mapping is undefined. */
6816 PyErr_Clear();
6817 x = Py_None;
6818 Py_INCREF(x);
6819 } else
6820 goto onError;
6821 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006822
Benjamin Peterson29060642009-01-31 22:14:21 +00006823 /* Apply mapping */
6824 if (PyLong_Check(x)) {
6825 long value = PyLong_AS_LONG(x);
6826 if (value < 0 || value > 65535) {
6827 PyErr_SetString(PyExc_TypeError,
6828 "character mapping must be in range(65536)");
6829 Py_DECREF(x);
6830 goto onError;
6831 }
6832 *p++ = (Py_UNICODE)value;
6833 }
6834 else if (x == Py_None) {
6835 /* undefined mapping */
6836 outpos = p-PyUnicode_AS_UNICODE(v);
6837 startinpos = s-starts;
6838 endinpos = startinpos+1;
6839 if (unicode_decode_call_errorhandler(
6840 errors, &errorHandler,
6841 "charmap", "character maps to <undefined>",
6842 &starts, &e, &startinpos, &endinpos, &exc, &s,
6843 &v, &outpos, &p)) {
6844 Py_DECREF(x);
6845 goto onError;
6846 }
6847 Py_DECREF(x);
6848 continue;
6849 }
6850 else if (PyUnicode_Check(x)) {
6851 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006852
Benjamin Peterson29060642009-01-31 22:14:21 +00006853 if (targetsize == 1)
6854 /* 1-1 mapping */
6855 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006856
Benjamin Peterson29060642009-01-31 22:14:21 +00006857 else if (targetsize > 1) {
6858 /* 1-n mapping */
6859 if (targetsize > extrachars) {
6860 /* resize first */
6861 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
6862 Py_ssize_t needed = (targetsize - extrachars) + \
6863 (targetsize << 2);
6864 extrachars += needed;
6865 /* XXX overflow detection missing */
Victor Stinnerfe226c02011-10-03 03:52:20 +02006866 if (PyUnicode_Resize((PyObject**)&v,
Benjamin Peterson29060642009-01-31 22:14:21 +00006867 PyUnicode_GET_SIZE(v) + needed) < 0) {
6868 Py_DECREF(x);
6869 goto onError;
6870 }
6871 p = PyUnicode_AS_UNICODE(v) + oldpos;
6872 }
6873 Py_UNICODE_COPY(p,
6874 PyUnicode_AS_UNICODE(x),
6875 targetsize);
6876 p += targetsize;
6877 extrachars -= targetsize;
6878 }
6879 /* 1-0 mapping: skip the character */
6880 }
6881 else {
6882 /* wrong return value */
6883 PyErr_SetString(PyExc_TypeError,
6884 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006885 Py_DECREF(x);
6886 goto onError;
6887 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006888 Py_DECREF(x);
6889 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006890 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006891 }
6892 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinnerfe226c02011-10-03 03:52:20 +02006893 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006894 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006895 Py_XDECREF(errorHandler);
6896 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006897 if (PyUnicode_READY(v) == -1) {
6898 Py_DECREF(v);
6899 return NULL;
6900 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006901 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006902
Benjamin Peterson29060642009-01-31 22:14:21 +00006903 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006904 Py_XDECREF(errorHandler);
6905 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006906 Py_XDECREF(v);
6907 return NULL;
6908}
6909
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006910/* Charmap encoding: the lookup table */
6911
Alexander Belopolsky40018472011-02-26 01:02:56 +00006912struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00006913 PyObject_HEAD
6914 unsigned char level1[32];
6915 int count2, count3;
6916 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006917};
6918
6919static PyObject*
6920encoding_map_size(PyObject *obj, PyObject* args)
6921{
6922 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006923 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00006924 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006925}
6926
6927static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006928 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00006929 PyDoc_STR("Return the size (in bytes) of this object") },
6930 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006931};
6932
6933static void
6934encoding_map_dealloc(PyObject* o)
6935{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006936 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006937}
6938
6939static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006940 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006941 "EncodingMap", /*tp_name*/
6942 sizeof(struct encoding_map), /*tp_basicsize*/
6943 0, /*tp_itemsize*/
6944 /* methods */
6945 encoding_map_dealloc, /*tp_dealloc*/
6946 0, /*tp_print*/
6947 0, /*tp_getattr*/
6948 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00006949 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00006950 0, /*tp_repr*/
6951 0, /*tp_as_number*/
6952 0, /*tp_as_sequence*/
6953 0, /*tp_as_mapping*/
6954 0, /*tp_hash*/
6955 0, /*tp_call*/
6956 0, /*tp_str*/
6957 0, /*tp_getattro*/
6958 0, /*tp_setattro*/
6959 0, /*tp_as_buffer*/
6960 Py_TPFLAGS_DEFAULT, /*tp_flags*/
6961 0, /*tp_doc*/
6962 0, /*tp_traverse*/
6963 0, /*tp_clear*/
6964 0, /*tp_richcompare*/
6965 0, /*tp_weaklistoffset*/
6966 0, /*tp_iter*/
6967 0, /*tp_iternext*/
6968 encoding_map_methods, /*tp_methods*/
6969 0, /*tp_members*/
6970 0, /*tp_getset*/
6971 0, /*tp_base*/
6972 0, /*tp_dict*/
6973 0, /*tp_descr_get*/
6974 0, /*tp_descr_set*/
6975 0, /*tp_dictoffset*/
6976 0, /*tp_init*/
6977 0, /*tp_alloc*/
6978 0, /*tp_new*/
6979 0, /*tp_free*/
6980 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006981};
6982
6983PyObject*
6984PyUnicode_BuildEncodingMap(PyObject* string)
6985{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006986 PyObject *result;
6987 struct encoding_map *mresult;
6988 int i;
6989 int need_dict = 0;
6990 unsigned char level1[32];
6991 unsigned char level2[512];
6992 unsigned char *mlevel1, *mlevel2, *mlevel3;
6993 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006994 int kind;
6995 void *data;
6996 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006997
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006998 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006999 PyErr_BadArgument();
7000 return NULL;
7001 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007002 kind = PyUnicode_KIND(string);
7003 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007004 memset(level1, 0xFF, sizeof level1);
7005 memset(level2, 0xFF, sizeof level2);
7006
7007 /* If there isn't a one-to-one mapping of NULL to \0,
7008 or if there are non-BMP characters, we need to use
7009 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007010 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007011 need_dict = 1;
7012 for (i = 1; i < 256; i++) {
7013 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007014 ch = PyUnicode_READ(kind, data, i);
7015 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007016 need_dict = 1;
7017 break;
7018 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007019 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007020 /* unmapped character */
7021 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007022 l1 = ch >> 11;
7023 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007024 if (level1[l1] == 0xFF)
7025 level1[l1] = count2++;
7026 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007027 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007028 }
7029
7030 if (count2 >= 0xFF || count3 >= 0xFF)
7031 need_dict = 1;
7032
7033 if (need_dict) {
7034 PyObject *result = PyDict_New();
7035 PyObject *key, *value;
7036 if (!result)
7037 return NULL;
7038 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007039 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007040 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007041 if (!key || !value)
7042 goto failed1;
7043 if (PyDict_SetItem(result, key, value) == -1)
7044 goto failed1;
7045 Py_DECREF(key);
7046 Py_DECREF(value);
7047 }
7048 return result;
7049 failed1:
7050 Py_XDECREF(key);
7051 Py_XDECREF(value);
7052 Py_DECREF(result);
7053 return NULL;
7054 }
7055
7056 /* Create a three-level trie */
7057 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7058 16*count2 + 128*count3 - 1);
7059 if (!result)
7060 return PyErr_NoMemory();
7061 PyObject_Init(result, &EncodingMapType);
7062 mresult = (struct encoding_map*)result;
7063 mresult->count2 = count2;
7064 mresult->count3 = count3;
7065 mlevel1 = mresult->level1;
7066 mlevel2 = mresult->level23;
7067 mlevel3 = mresult->level23 + 16*count2;
7068 memcpy(mlevel1, level1, 32);
7069 memset(mlevel2, 0xFF, 16*count2);
7070 memset(mlevel3, 0, 128*count3);
7071 count3 = 0;
7072 for (i = 1; i < 256; i++) {
7073 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007074 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007075 /* unmapped character */
7076 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007077 o1 = PyUnicode_READ(kind, data, i)>>11;
7078 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007079 i2 = 16*mlevel1[o1] + o2;
7080 if (mlevel2[i2] == 0xFF)
7081 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007082 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007083 i3 = 128*mlevel2[i2] + o3;
7084 mlevel3[i3] = i;
7085 }
7086 return result;
7087}
7088
7089static int
7090encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
7091{
7092 struct encoding_map *map = (struct encoding_map*)mapping;
7093 int l1 = c>>11;
7094 int l2 = (c>>7) & 0xF;
7095 int l3 = c & 0x7F;
7096 int i;
7097
7098#ifdef Py_UNICODE_WIDE
7099 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007100 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007101 }
7102#endif
7103 if (c == 0)
7104 return 0;
7105 /* level 1*/
7106 i = map->level1[l1];
7107 if (i == 0xFF) {
7108 return -1;
7109 }
7110 /* level 2*/
7111 i = map->level23[16*i+l2];
7112 if (i == 0xFF) {
7113 return -1;
7114 }
7115 /* level 3 */
7116 i = map->level23[16*map->count2 + 128*i + l3];
7117 if (i == 0) {
7118 return -1;
7119 }
7120 return i;
7121}
7122
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007123/* Lookup the character ch in the mapping. If the character
7124 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007125 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007126static PyObject *
7127charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007128{
Christian Heimes217cfd12007-12-02 14:31:20 +00007129 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007130 PyObject *x;
7131
7132 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007133 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007134 x = PyObject_GetItem(mapping, w);
7135 Py_DECREF(w);
7136 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007137 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7138 /* No mapping found means: mapping is undefined. */
7139 PyErr_Clear();
7140 x = Py_None;
7141 Py_INCREF(x);
7142 return x;
7143 } else
7144 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007145 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007146 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007147 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007148 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007149 long value = PyLong_AS_LONG(x);
7150 if (value < 0 || value > 255) {
7151 PyErr_SetString(PyExc_TypeError,
7152 "character mapping must be in range(256)");
7153 Py_DECREF(x);
7154 return NULL;
7155 }
7156 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007157 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007158 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007159 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007160 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007161 /* wrong return value */
7162 PyErr_Format(PyExc_TypeError,
7163 "character mapping must return integer, bytes or None, not %.400s",
7164 x->ob_type->tp_name);
7165 Py_DECREF(x);
7166 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007167 }
7168}
7169
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007170static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007171charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007172{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007173 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7174 /* exponentially overallocate to minimize reallocations */
7175 if (requiredsize < 2*outsize)
7176 requiredsize = 2*outsize;
7177 if (_PyBytes_Resize(outobj, requiredsize))
7178 return -1;
7179 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007180}
7181
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* an error occurred; callers propagate it */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007185/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007186 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007187 space is available. Return a new reference to the object that
7188 was put in the output buffer, or Py_None, if the mapping was undefined
7189 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007190 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007191static charmapencode_result
7192charmapencode_output(Py_UNICODE c, PyObject *mapping,
7193 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007194{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007195 PyObject *rep;
7196 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007197 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007198
Christian Heimes90aa7642007-12-19 02:45:37 +00007199 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007200 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007201 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007202 if (res == -1)
7203 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007204 if (outsize<requiredsize)
7205 if (charmapencode_resize(outobj, outpos, requiredsize))
7206 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007207 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007208 outstart[(*outpos)++] = (char)res;
7209 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007210 }
7211
7212 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007213 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007214 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007215 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007216 Py_DECREF(rep);
7217 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007218 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007219 if (PyLong_Check(rep)) {
7220 Py_ssize_t requiredsize = *outpos+1;
7221 if (outsize<requiredsize)
7222 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7223 Py_DECREF(rep);
7224 return enc_EXCEPTION;
7225 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007226 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007227 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007228 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007229 else {
7230 const char *repchars = PyBytes_AS_STRING(rep);
7231 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7232 Py_ssize_t requiredsize = *outpos+repsize;
7233 if (outsize<requiredsize)
7234 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7235 Py_DECREF(rep);
7236 return enc_EXCEPTION;
7237 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007238 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007239 memcpy(outstart + *outpos, repchars, repsize);
7240 *outpos += repsize;
7241 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007242 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007243 Py_DECREF(rep);
7244 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007245}
7246
7247/* handle an error in PyUnicode_EncodeCharmap
7248 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007249static int
7250charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00007251 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007252 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007253 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007254 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007255{
7256 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007257 Py_ssize_t repsize;
7258 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007259 Py_UNICODE *uni2;
7260 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007261 Py_ssize_t collstartpos = *inpos;
7262 Py_ssize_t collendpos = *inpos+1;
7263 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007264 char *encoding = "charmap";
7265 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007266 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007267
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007268 /* find all unencodable characters */
7269 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007270 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007271 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007272 int res = encoding_map_lookup(p[collendpos], mapping);
7273 if (res != -1)
7274 break;
7275 ++collendpos;
7276 continue;
7277 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007278
Benjamin Peterson29060642009-01-31 22:14:21 +00007279 rep = charmapencode_lookup(p[collendpos], mapping);
7280 if (rep==NULL)
7281 return -1;
7282 else if (rep!=Py_None) {
7283 Py_DECREF(rep);
7284 break;
7285 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007286 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007287 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007288 }
7289 /* cache callback name lookup
7290 * (if not done yet, i.e. it's the first error) */
7291 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007292 if ((errors==NULL) || (!strcmp(errors, "strict")))
7293 *known_errorHandler = 1;
7294 else if (!strcmp(errors, "replace"))
7295 *known_errorHandler = 2;
7296 else if (!strcmp(errors, "ignore"))
7297 *known_errorHandler = 3;
7298 else if (!strcmp(errors, "xmlcharrefreplace"))
7299 *known_errorHandler = 4;
7300 else
7301 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007302 }
7303 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007304 case 1: /* strict */
7305 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7306 return -1;
7307 case 2: /* replace */
7308 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007309 x = charmapencode_output('?', mapping, res, respos);
7310 if (x==enc_EXCEPTION) {
7311 return -1;
7312 }
7313 else if (x==enc_FAILED) {
7314 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7315 return -1;
7316 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007317 }
7318 /* fall through */
7319 case 3: /* ignore */
7320 *inpos = collendpos;
7321 break;
7322 case 4: /* xmlcharrefreplace */
7323 /* generate replacement (temporarily (mis)uses p) */
7324 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007325 char buffer[2+29+1+1];
7326 char *cp;
7327 sprintf(buffer, "&#%d;", (int)p[collpos]);
7328 for (cp = buffer; *cp; ++cp) {
7329 x = charmapencode_output(*cp, mapping, res, respos);
7330 if (x==enc_EXCEPTION)
7331 return -1;
7332 else if (x==enc_FAILED) {
7333 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7334 return -1;
7335 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007336 }
7337 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007338 *inpos = collendpos;
7339 break;
7340 default:
7341 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00007342 encoding, reason, p, size, exceptionObject,
7343 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007344 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007345 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007346 if (PyBytes_Check(repunicode)) {
7347 /* Directly copy bytes result to output. */
7348 Py_ssize_t outsize = PyBytes_Size(*res);
7349 Py_ssize_t requiredsize;
7350 repsize = PyBytes_Size(repunicode);
7351 requiredsize = *respos + repsize;
7352 if (requiredsize > outsize)
7353 /* Make room for all additional bytes. */
7354 if (charmapencode_resize(res, respos, requiredsize)) {
7355 Py_DECREF(repunicode);
7356 return -1;
7357 }
7358 memcpy(PyBytes_AsString(*res) + *respos,
7359 PyBytes_AsString(repunicode), repsize);
7360 *respos += repsize;
7361 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007362 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007363 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007364 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007365 /* generate replacement */
7366 repsize = PyUnicode_GET_SIZE(repunicode);
7367 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007368 x = charmapencode_output(*uni2, mapping, res, respos);
7369 if (x==enc_EXCEPTION) {
7370 return -1;
7371 }
7372 else if (x==enc_FAILED) {
7373 Py_DECREF(repunicode);
7374 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7375 return -1;
7376 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007377 }
7378 *inpos = newpos;
7379 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007380 }
7381 return 0;
7382}
7383
Alexander Belopolsky40018472011-02-26 01:02:56 +00007384PyObject *
7385PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7386 Py_ssize_t size,
7387 PyObject *mapping,
7388 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007389{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007390 /* output object */
7391 PyObject *res = NULL;
7392 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007393 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007394 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007395 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007396 PyObject *errorHandler = NULL;
7397 PyObject *exc = NULL;
7398 /* the following variable is used for caching string comparisons
7399 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7400 * 3=ignore, 4=xmlcharrefreplace */
7401 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007402
7403 /* Default to Latin-1 */
7404 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007405 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007406
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007407 /* allocate enough for a simple encoding without
7408 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007409 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007410 if (res == NULL)
7411 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007412 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007413 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007414
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007415 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007416 /* try to encode it */
7417 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7418 if (x==enc_EXCEPTION) /* error */
7419 goto onError;
7420 if (x==enc_FAILED) { /* unencodable character */
7421 if (charmap_encoding_error(p, size, &inpos, mapping,
7422 &exc,
7423 &known_errorHandler, &errorHandler, errors,
7424 &res, &respos)) {
7425 goto onError;
7426 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007427 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007428 else
7429 /* done with this character => adjust input position */
7430 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007431 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007432
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007433 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007434 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007435 if (_PyBytes_Resize(&res, respos) < 0)
7436 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00007437
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007438 Py_XDECREF(exc);
7439 Py_XDECREF(errorHandler);
7440 return res;
7441
Benjamin Peterson29060642009-01-31 22:14:21 +00007442 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007443 Py_XDECREF(res);
7444 Py_XDECREF(exc);
7445 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007446 return NULL;
7447}
7448
Alexander Belopolsky40018472011-02-26 01:02:56 +00007449PyObject *
7450PyUnicode_AsCharmapString(PyObject *unicode,
7451 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007452{
7453 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007454 PyErr_BadArgument();
7455 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007456 }
7457 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007458 PyUnicode_GET_SIZE(unicode),
7459 mapping,
7460 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007461}
7462
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007463/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007464static void
7465make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007466 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007467 Py_ssize_t startpos, Py_ssize_t endpos,
7468 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007469{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007470 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007471 *exceptionObject = _PyUnicodeTranslateError_Create(
7472 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007473 }
7474 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007475 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7476 goto onError;
7477 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7478 goto onError;
7479 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7480 goto onError;
7481 return;
7482 onError:
7483 Py_DECREF(*exceptionObject);
7484 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007485 }
7486}
7487
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007488/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007489static void
7490raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007491 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007492 Py_ssize_t startpos, Py_ssize_t endpos,
7493 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007494{
7495 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007496 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007497 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007498 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007499}
7500
7501/* error handling callback helper:
7502 build arguments, call the callback and check the arguments,
7503 put the result into newpos and return the replacement string, which
7504 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007505static PyObject *
7506unicode_translate_call_errorhandler(const char *errors,
7507 PyObject **errorHandler,
7508 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007509 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007510 Py_ssize_t startpos, Py_ssize_t endpos,
7511 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007512{
Benjamin Peterson142957c2008-07-04 19:55:29 +00007513 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007514
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007515 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007516 PyObject *restuple;
7517 PyObject *resunicode;
7518
7519 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007520 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007521 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007522 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007523 }
7524
7525 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007526 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007527 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007528 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007529
7530 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00007531 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007532 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007533 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007534 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00007535 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007536 Py_DECREF(restuple);
7537 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007538 }
7539 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00007540 &resunicode, &i_newpos)) {
7541 Py_DECREF(restuple);
7542 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007543 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007544 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007545 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007546 else
7547 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007548 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007549 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7550 Py_DECREF(restuple);
7551 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007552 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007553 Py_INCREF(resunicode);
7554 Py_DECREF(restuple);
7555 return resunicode;
7556}
7557
7558/* Lookup the character ch in the mapping and put the result in result,
7559 which must be decrefed by the caller.
7560 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007561static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007562charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007563{
Christian Heimes217cfd12007-12-02 14:31:20 +00007564 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007565 PyObject *x;
7566
7567 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007568 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007569 x = PyObject_GetItem(mapping, w);
7570 Py_DECREF(w);
7571 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007572 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7573 /* No mapping found means: use 1:1 mapping. */
7574 PyErr_Clear();
7575 *result = NULL;
7576 return 0;
7577 } else
7578 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007579 }
7580 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007581 *result = x;
7582 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007583 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007584 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007585 long value = PyLong_AS_LONG(x);
7586 long max = PyUnicode_GetMax();
7587 if (value < 0 || value > max) {
7588 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00007589 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007590 Py_DECREF(x);
7591 return -1;
7592 }
7593 *result = x;
7594 return 0;
7595 }
7596 else if (PyUnicode_Check(x)) {
7597 *result = x;
7598 return 0;
7599 }
7600 else {
7601 /* wrong return value */
7602 PyErr_SetString(PyExc_TypeError,
7603 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007604 Py_DECREF(x);
7605 return -1;
7606 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007607}
7608/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00007609 if not reallocate and adjust various state variables.
7610 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007611static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007612charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00007613 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007614{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007615 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00007616 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007617 /* exponentially overallocate to minimize reallocations */
7618 if (requiredsize < 2 * oldsize)
7619 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007620 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
7621 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007622 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007623 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007624 }
7625 return 0;
7626}
7627/* lookup the character, put the result in the output string and adjust
7628 various state variables. Return a new reference to the object that
7629 was put in the output buffer in *result, or Py_None, if the mapping was
7630 undefined (in which case no character was written).
7631 The called must decref result.
7632 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007633static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007634charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
7635 PyObject *mapping, Py_UCS4 **output,
7636 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007637 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007638{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007639 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
7640 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00007641 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007642 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007643 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007644 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007645 }
7646 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007647 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00007648 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007649 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007650 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007651 }
7652 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007653 Py_ssize_t repsize;
7654 if (PyUnicode_READY(*res) == -1)
7655 return -1;
7656 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00007657 if (repsize==1) {
7658 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007659 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00007660 }
7661 else if (repsize!=0) {
7662 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007663 Py_ssize_t requiredsize = *opos +
7664 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00007665 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007666 Py_ssize_t i;
7667 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007668 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007669 for(i = 0; i < repsize; i++)
7670 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00007671 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007672 }
7673 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007674 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007675 return 0;
7676}
7677
Alexander Belopolsky40018472011-02-26 01:02:56 +00007678PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007679_PyUnicode_TranslateCharmap(PyObject *input,
7680 PyObject *mapping,
7681 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007682{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007683 /* input object */
7684 char *idata;
7685 Py_ssize_t size, i;
7686 int kind;
7687 /* output buffer */
7688 Py_UCS4 *output = NULL;
7689 Py_ssize_t osize;
7690 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007691 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007692 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007693 char *reason = "character maps to <undefined>";
7694 PyObject *errorHandler = NULL;
7695 PyObject *exc = NULL;
7696 /* the following variable is used for caching string comparisons
7697 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7698 * 3=ignore, 4=xmlcharrefreplace */
7699 int known_errorHandler = -1;
7700
Guido van Rossumd57fd912000-03-10 22:53:23 +00007701 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007702 PyErr_BadArgument();
7703 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007704 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007705
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007706 if (PyUnicode_READY(input) == -1)
7707 return NULL;
7708 idata = (char*)PyUnicode_DATA(input);
7709 kind = PyUnicode_KIND(input);
7710 size = PyUnicode_GET_LENGTH(input);
7711 i = 0;
7712
7713 if (size == 0) {
7714 Py_INCREF(input);
7715 return input;
7716 }
7717
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007718 /* allocate enough for a simple 1:1 translation without
7719 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007720 osize = size;
7721 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
7722 opos = 0;
7723 if (output == NULL) {
7724 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00007725 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007726 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007727
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007728 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007729 /* try to encode it */
7730 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007731 if (charmaptranslate_output(input, i, mapping,
7732 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007733 Py_XDECREF(x);
7734 goto onError;
7735 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007736 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00007737 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007738 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007739 else { /* untranslatable character */
7740 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7741 Py_ssize_t repsize;
7742 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007743 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00007744 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007745 Py_ssize_t collstart = i;
7746 Py_ssize_t collend = i+1;
7747 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007748
Benjamin Peterson29060642009-01-31 22:14:21 +00007749 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007750 while (collend < size) {
7751 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007752 goto onError;
7753 Py_XDECREF(x);
7754 if (x!=Py_None)
7755 break;
7756 ++collend;
7757 }
7758 /* cache callback name lookup
7759 * (if not done yet, i.e. it's the first error) */
7760 if (known_errorHandler==-1) {
7761 if ((errors==NULL) || (!strcmp(errors, "strict")))
7762 known_errorHandler = 1;
7763 else if (!strcmp(errors, "replace"))
7764 known_errorHandler = 2;
7765 else if (!strcmp(errors, "ignore"))
7766 known_errorHandler = 3;
7767 else if (!strcmp(errors, "xmlcharrefreplace"))
7768 known_errorHandler = 4;
7769 else
7770 known_errorHandler = 0;
7771 }
7772 switch (known_errorHandler) {
7773 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007774 raise_translate_exception(&exc, input, collstart,
7775 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007776 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007777 case 2: /* replace */
7778 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007779 for (coll = collstart; coll<collend; coll++)
7780 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00007781 /* fall through */
7782 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007783 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007784 break;
7785 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007786 /* generate replacement (temporarily (mis)uses i) */
7787 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007788 char buffer[2+29+1+1];
7789 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007790 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
7791 if (charmaptranslate_makespace(&output, &osize,
7792 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00007793 goto onError;
7794 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007795 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00007796 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007797 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007798 break;
7799 default:
7800 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007801 reason, input, &exc,
7802 collstart, collend, &newpos);
7803 if (repunicode == NULL || PyUnicode_READY(repunicode) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007804 goto onError;
7805 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007806 repsize = PyUnicode_GET_LENGTH(repunicode);
7807 if (charmaptranslate_makespace(&output, &osize,
7808 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007809 Py_DECREF(repunicode);
7810 goto onError;
7811 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007812 for (uni2 = 0; repsize-->0; ++uni2)
7813 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
7814 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00007815 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007816 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007817 }
7818 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007819 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
7820 if (!res)
7821 goto onError;
7822 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007823 Py_XDECREF(exc);
7824 Py_XDECREF(errorHandler);
7825 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007826
Benjamin Peterson29060642009-01-31 22:14:21 +00007827 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007828 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007829 Py_XDECREF(exc);
7830 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007831 return NULL;
7832}
7833
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007834/* Deprecated. Use PyUnicode_Translate instead. */
7835PyObject *
7836PyUnicode_TranslateCharmap(const Py_UNICODE *p,
7837 Py_ssize_t size,
7838 PyObject *mapping,
7839 const char *errors)
7840{
7841 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7842 if (!unicode)
7843 return NULL;
7844 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
7845}
7846
Alexander Belopolsky40018472011-02-26 01:02:56 +00007847PyObject *
7848PyUnicode_Translate(PyObject *str,
7849 PyObject *mapping,
7850 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007851{
7852 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007853
Guido van Rossumd57fd912000-03-10 22:53:23 +00007854 str = PyUnicode_FromObject(str);
7855 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007856 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007857 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007858 Py_DECREF(str);
7859 return result;
Tim Petersced69f82003-09-16 20:30:58 +00007860
Benjamin Peterson29060642009-01-31 22:14:21 +00007861 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007862 Py_XDECREF(str);
7863 return NULL;
7864}
Tim Petersced69f82003-09-16 20:30:58 +00007865
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007866static Py_UCS4
7867fix_decimal_and_space_to_ascii(PyUnicodeObject *self)
7868{
7869 /* No need to call PyUnicode_READY(self) because this function is only
7870 called as a callback from fixup() which does it already. */
7871 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
7872 const int kind = PyUnicode_KIND(self);
7873 void *data = PyUnicode_DATA(self);
7874 Py_UCS4 maxchar = 0, ch, fixed;
7875 Py_ssize_t i;
7876
7877 for (i = 0; i < len; ++i) {
7878 ch = PyUnicode_READ(kind, data, i);
7879 fixed = 0;
7880 if (ch > 127) {
7881 if (Py_UNICODE_ISSPACE(ch))
7882 fixed = ' ';
7883 else {
7884 const int decimal = Py_UNICODE_TODECIMAL(ch);
7885 if (decimal >= 0)
7886 fixed = '0' + decimal;
7887 }
7888 if (fixed != 0) {
7889 if (fixed > maxchar)
7890 maxchar = fixed;
7891 PyUnicode_WRITE(kind, data, i, fixed);
7892 }
7893 else if (ch > maxchar)
7894 maxchar = ch;
7895 }
7896 else if (ch > maxchar)
7897 maxchar = ch;
7898 }
7899
7900 return maxchar;
7901}
7902
7903PyObject *
7904_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
7905{
7906 if (!PyUnicode_Check(unicode)) {
7907 PyErr_BadInternalCall();
7908 return NULL;
7909 }
7910 if (PyUnicode_READY(unicode) == -1)
7911 return NULL;
7912 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
7913 /* If the string is already ASCII, just return the same string */
7914 Py_INCREF(unicode);
7915 return unicode;
7916 }
7917 return fixup((PyUnicodeObject *)unicode, fix_decimal_and_space_to_ascii);
7918}
7919
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007920PyObject *
7921PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
7922 Py_ssize_t length)
7923{
7924 PyObject *result;
7925 Py_UNICODE *p; /* write pointer into result */
7926 Py_ssize_t i;
7927 /* Copy to a new string */
7928 result = (PyObject *)_PyUnicode_New(length);
7929 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
7930 if (result == NULL)
7931 return result;
7932 p = PyUnicode_AS_UNICODE(result);
7933 /* Iterate over code points */
7934 for (i = 0; i < length; i++) {
7935 Py_UNICODE ch =s[i];
7936 if (ch > 127) {
7937 int decimal = Py_UNICODE_TODECIMAL(ch);
7938 if (decimal >= 0)
7939 p[i] = '0' + decimal;
7940 }
7941 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007942 if (PyUnicode_READY((PyUnicodeObject*)result) == -1) {
7943 Py_DECREF(result);
7944 return NULL;
7945 }
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007946 return result;
7947}
Guido van Rossum9e896b32000-04-05 20:11:21 +00007948/* --- Decimal Encoder ---------------------------------------------------- */
7949
Alexander Belopolsky40018472011-02-26 01:02:56 +00007950int
7951PyUnicode_EncodeDecimal(Py_UNICODE *s,
7952 Py_ssize_t length,
7953 char *output,
7954 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00007955{
7956 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007957 PyObject *errorHandler = NULL;
7958 PyObject *exc = NULL;
7959 const char *encoding = "decimal";
7960 const char *reason = "invalid decimal Unicode string";
7961 /* the following variable is used for caching string comparisons
7962 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
7963 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00007964
7965 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007966 PyErr_BadArgument();
7967 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00007968 }
7969
7970 p = s;
7971 end = s + length;
7972 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007973 register Py_UNICODE ch = *p;
7974 int decimal;
7975 PyObject *repunicode;
7976 Py_ssize_t repsize;
7977 Py_ssize_t newpos;
7978 Py_UNICODE *uni2;
7979 Py_UNICODE *collstart;
7980 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00007981
Benjamin Peterson29060642009-01-31 22:14:21 +00007982 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007983 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00007984 ++p;
7985 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007986 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007987 decimal = Py_UNICODE_TODECIMAL(ch);
7988 if (decimal >= 0) {
7989 *output++ = '0' + decimal;
7990 ++p;
7991 continue;
7992 }
7993 if (0 < ch && ch < 256) {
7994 *output++ = (char)ch;
7995 ++p;
7996 continue;
7997 }
7998 /* All other characters are considered unencodable */
7999 collstart = p;
8000 collend = p+1;
8001 while (collend < end) {
8002 if ((0 < *collend && *collend < 256) ||
8003 !Py_UNICODE_ISSPACE(*collend) ||
8004 Py_UNICODE_TODECIMAL(*collend))
8005 break;
8006 }
8007 /* cache callback name lookup
8008 * (if not done yet, i.e. it's the first error) */
8009 if (known_errorHandler==-1) {
8010 if ((errors==NULL) || (!strcmp(errors, "strict")))
8011 known_errorHandler = 1;
8012 else if (!strcmp(errors, "replace"))
8013 known_errorHandler = 2;
8014 else if (!strcmp(errors, "ignore"))
8015 known_errorHandler = 3;
8016 else if (!strcmp(errors, "xmlcharrefreplace"))
8017 known_errorHandler = 4;
8018 else
8019 known_errorHandler = 0;
8020 }
8021 switch (known_errorHandler) {
8022 case 1: /* strict */
8023 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
8024 goto onError;
8025 case 2: /* replace */
8026 for (p = collstart; p < collend; ++p)
8027 *output++ = '?';
8028 /* fall through */
8029 case 3: /* ignore */
8030 p = collend;
8031 break;
8032 case 4: /* xmlcharrefreplace */
8033 /* generate replacement (temporarily (mis)uses p) */
8034 for (p = collstart; p < collend; ++p)
8035 output += sprintf(output, "&#%d;", (int)*p);
8036 p = collend;
8037 break;
8038 default:
8039 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
8040 encoding, reason, s, length, &exc,
8041 collstart-s, collend-s, &newpos);
8042 if (repunicode == NULL)
8043 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008044 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00008045 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008046 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8047 Py_DECREF(repunicode);
8048 goto onError;
8049 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008050 /* generate replacement */
8051 repsize = PyUnicode_GET_SIZE(repunicode);
8052 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
8053 Py_UNICODE ch = *uni2;
8054 if (Py_UNICODE_ISSPACE(ch))
8055 *output++ = ' ';
8056 else {
8057 decimal = Py_UNICODE_TODECIMAL(ch);
8058 if (decimal >= 0)
8059 *output++ = '0' + decimal;
8060 else if (0 < ch && ch < 256)
8061 *output++ = (char)ch;
8062 else {
8063 Py_DECREF(repunicode);
8064 raise_encode_exception(&exc, encoding,
8065 s, length, collstart-s, collend-s, reason);
8066 goto onError;
8067 }
8068 }
8069 }
8070 p = s + newpos;
8071 Py_DECREF(repunicode);
8072 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00008073 }
8074 /* 0-terminate the output string */
8075 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008076 Py_XDECREF(exc);
8077 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008078 return 0;
8079
Benjamin Peterson29060642009-01-31 22:14:21 +00008080 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008081 Py_XDECREF(exc);
8082 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008083 return -1;
8084}
8085
Guido van Rossumd57fd912000-03-10 22:53:23 +00008086/* --- Helpers ------------------------------------------------------------ */
8087
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008088#include "stringlib/ucs1lib.h"
8089#include "stringlib/fastsearch.h"
8090#include "stringlib/partition.h"
8091#include "stringlib/split.h"
8092#include "stringlib/count.h"
8093#include "stringlib/find.h"
8094#include "stringlib/localeutil.h"
8095#include "stringlib/undef.h"
8096
8097#include "stringlib/ucs2lib.h"
8098#include "stringlib/fastsearch.h"
8099#include "stringlib/partition.h"
8100#include "stringlib/split.h"
8101#include "stringlib/count.h"
8102#include "stringlib/find.h"
8103#include "stringlib/localeutil.h"
8104#include "stringlib/undef.h"
8105
8106#include "stringlib/ucs4lib.h"
8107#include "stringlib/fastsearch.h"
8108#include "stringlib/partition.h"
8109#include "stringlib/split.h"
8110#include "stringlib/count.h"
8111#include "stringlib/find.h"
8112#include "stringlib/localeutil.h"
8113#include "stringlib/undef.h"
8114
8115static Py_ssize_t
8116any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
8117 const Py_UCS1*, Py_ssize_t,
8118 Py_ssize_t, Py_ssize_t),
8119 Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
8120 const Py_UCS2*, Py_ssize_t,
8121 Py_ssize_t, Py_ssize_t),
8122 Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t,
8123 const Py_UCS4*, Py_ssize_t,
8124 Py_ssize_t, Py_ssize_t),
8125 PyObject* s1, PyObject* s2,
8126 Py_ssize_t start,
8127 Py_ssize_t end)
8128{
8129 int kind1, kind2, kind;
8130 void *buf1, *buf2;
8131 Py_ssize_t len1, len2, result;
8132
8133 kind1 = PyUnicode_KIND(s1);
8134 kind2 = PyUnicode_KIND(s2);
8135 kind = kind1 > kind2 ? kind1 : kind2;
8136 buf1 = PyUnicode_DATA(s1);
8137 buf2 = PyUnicode_DATA(s2);
8138 if (kind1 != kind)
8139 buf1 = _PyUnicode_AsKind(s1, kind);
8140 if (!buf1)
8141 return -2;
8142 if (kind2 != kind)
8143 buf2 = _PyUnicode_AsKind(s2, kind);
8144 if (!buf2) {
8145 if (kind1 != kind) PyMem_Free(buf1);
8146 return -2;
8147 }
8148 len1 = PyUnicode_GET_LENGTH(s1);
8149 len2 = PyUnicode_GET_LENGTH(s2);
8150
8151 switch(kind) {
8152 case PyUnicode_1BYTE_KIND:
8153 result = ucs1(buf1, len1, buf2, len2, start, end);
8154 break;
8155 case PyUnicode_2BYTE_KIND:
8156 result = ucs2(buf1, len1, buf2, len2, start, end);
8157 break;
8158 case PyUnicode_4BYTE_KIND:
8159 result = ucs4(buf1, len1, buf2, len2, start, end);
8160 break;
8161 default:
8162 assert(0); result = -2;
8163 }
8164
8165 if (kind1 != kind)
8166 PyMem_Free(buf1);
8167 if (kind2 != kind)
8168 PyMem_Free(buf2);
8169
8170 return result;
8171}
8172
8173Py_ssize_t
8174_PyUnicode_InsertThousandsGrouping(int kind, void *data,
8175 Py_ssize_t n_buffer,
8176 void *digits, Py_ssize_t n_digits,
8177 Py_ssize_t min_width,
8178 const char *grouping,
8179 const char *thousands_sep)
8180{
8181 switch(kind) {
8182 case PyUnicode_1BYTE_KIND:
8183 return _PyUnicode_ucs1_InsertThousandsGrouping(
8184 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8185 min_width, grouping, thousands_sep);
8186 case PyUnicode_2BYTE_KIND:
8187 return _PyUnicode_ucs2_InsertThousandsGrouping(
8188 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
8189 min_width, grouping, thousands_sep);
8190 case PyUnicode_4BYTE_KIND:
8191 return _PyUnicode_ucs4_InsertThousandsGrouping(
8192 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
8193 min_width, grouping, thousands_sep);
8194 }
8195 assert(0);
8196 return -1;
8197}
8198
8199
Eric Smith8c663262007-08-25 02:26:07 +00008200#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00008201#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008202
Thomas Wouters477c8d52006-05-27 19:21:47 +00008203#include "stringlib/count.h"
8204#include "stringlib/find.h"
Eric Smith5807c412008-05-11 21:00:57 +00008205
/* Helper macro to fix up start/end slice values in place.

   start and end must be modifiable lvalues; len is the length of the
   sequence being sliced.  An end beyond len is cut back to len.  A negative
   start or end has len added to it and, if still negative, becomes 0.
   start is not clamped against len, and the macro does not enforce
   start <= end.

   The body is wrapped in do { } while (0) so that the macro behaves as a
   single statement (safe in an unbraced if/else), and every parameter is
   parenthesized.  Note that len may be evaluated more than once. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00008220
Alexander Belopolsky40018472011-02-26 01:02:56 +00008221Py_ssize_t
8222PyUnicode_Count(PyObject *str,
8223 PyObject *substr,
8224 Py_ssize_t start,
8225 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008226{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008227 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008228 PyUnicodeObject* str_obj;
8229 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008230 int kind1, kind2, kind;
8231 void *buf1 = NULL, *buf2 = NULL;
8232 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008233
Thomas Wouters477c8d52006-05-27 19:21:47 +00008234 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008235 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008236 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008237 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02008238 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008239 Py_DECREF(str_obj);
8240 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008241 }
Tim Petersced69f82003-09-16 20:30:58 +00008242
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008243 kind1 = PyUnicode_KIND(str_obj);
8244 kind2 = PyUnicode_KIND(sub_obj);
8245 kind = kind1 > kind2 ? kind1 : kind2;
8246 buf1 = PyUnicode_DATA(str_obj);
8247 if (kind1 != kind)
8248 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
8249 if (!buf1)
8250 goto onError;
8251 buf2 = PyUnicode_DATA(sub_obj);
8252 if (kind2 != kind)
8253 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
8254 if (!buf2)
8255 goto onError;
8256 len1 = PyUnicode_GET_LENGTH(str_obj);
8257 len2 = PyUnicode_GET_LENGTH(sub_obj);
8258
8259 ADJUST_INDICES(start, end, len1);
8260 switch(kind) {
8261 case PyUnicode_1BYTE_KIND:
8262 result = ucs1lib_count(
8263 ((Py_UCS1*)buf1) + start, end - start,
8264 buf2, len2, PY_SSIZE_T_MAX
8265 );
8266 break;
8267 case PyUnicode_2BYTE_KIND:
8268 result = ucs2lib_count(
8269 ((Py_UCS2*)buf1) + start, end - start,
8270 buf2, len2, PY_SSIZE_T_MAX
8271 );
8272 break;
8273 case PyUnicode_4BYTE_KIND:
8274 result = ucs4lib_count(
8275 ((Py_UCS4*)buf1) + start, end - start,
8276 buf2, len2, PY_SSIZE_T_MAX
8277 );
8278 break;
8279 default:
8280 assert(0); result = 0;
8281 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008282
8283 Py_DECREF(sub_obj);
8284 Py_DECREF(str_obj);
8285
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008286 if (kind1 != kind)
8287 PyMem_Free(buf1);
8288 if (kind2 != kind)
8289 PyMem_Free(buf2);
8290
Guido van Rossumd57fd912000-03-10 22:53:23 +00008291 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008292 onError:
8293 Py_DECREF(sub_obj);
8294 Py_DECREF(str_obj);
8295 if (kind1 != kind && buf1)
8296 PyMem_Free(buf1);
8297 if (kind2 != kind && buf2)
8298 PyMem_Free(buf2);
8299 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008300}
8301
Alexander Belopolsky40018472011-02-26 01:02:56 +00008302Py_ssize_t
8303PyUnicode_Find(PyObject *str,
8304 PyObject *sub,
8305 Py_ssize_t start,
8306 Py_ssize_t end,
8307 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008308{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008309 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008310
Guido van Rossumd57fd912000-03-10 22:53:23 +00008311 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008312 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008313 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008314 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008315 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008316 Py_DECREF(str);
8317 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008318 }
Tim Petersced69f82003-09-16 20:30:58 +00008319
Thomas Wouters477c8d52006-05-27 19:21:47 +00008320 if (direction > 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008321 result = any_find_slice(
8322 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
8323 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008324 );
8325 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008326 result = any_find_slice(
8327 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
8328 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008329 );
8330
Guido van Rossumd57fd912000-03-10 22:53:23 +00008331 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008332 Py_DECREF(sub);
8333
Guido van Rossumd57fd912000-03-10 22:53:23 +00008334 return result;
8335}
8336
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008337Py_ssize_t
8338PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8339 Py_ssize_t start, Py_ssize_t end,
8340 int direction)
8341{
8342 char *result;
8343 int kind;
8344 if (PyUnicode_READY(str) == -1)
8345 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02008346 if (start < 0 || end < 0) {
8347 PyErr_SetString(PyExc_IndexError, "string index out of range");
8348 return -2;
8349 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008350 if (end > PyUnicode_GET_LENGTH(str))
8351 end = PyUnicode_GET_LENGTH(str);
8352 kind = PyUnicode_KIND(str);
8353 result = findchar(PyUnicode_1BYTE_DATA(str)
8354 + PyUnicode_KIND_SIZE(kind, start),
8355 kind,
8356 end-start, ch, direction);
8357 if (!result)
8358 return -1;
8359 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8360}
8361
Alexander Belopolsky40018472011-02-26 01:02:56 +00008362static int
8363tailmatch(PyUnicodeObject *self,
8364 PyUnicodeObject *substring,
8365 Py_ssize_t start,
8366 Py_ssize_t end,
8367 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008368{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008369 int kind_self;
8370 int kind_sub;
8371 void *data_self;
8372 void *data_sub;
8373 Py_ssize_t offset;
8374 Py_ssize_t i;
8375 Py_ssize_t end_sub;
8376
8377 if (PyUnicode_READY(self) == -1 ||
8378 PyUnicode_READY(substring) == -1)
8379 return 0;
8380
8381 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008382 return 1;
8383
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008384 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8385 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008386 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008387 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008388
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008389 kind_self = PyUnicode_KIND(self);
8390 data_self = PyUnicode_DATA(self);
8391 kind_sub = PyUnicode_KIND(substring);
8392 data_sub = PyUnicode_DATA(substring);
8393 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8394
8395 if (direction > 0)
8396 offset = end;
8397 else
8398 offset = start;
8399
8400 if (PyUnicode_READ(kind_self, data_self, offset) ==
8401 PyUnicode_READ(kind_sub, data_sub, 0) &&
8402 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8403 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8404 /* If both are of the same kind, memcmp is sufficient */
8405 if (kind_self == kind_sub) {
8406 return ! memcmp((char *)data_self +
8407 (offset * PyUnicode_CHARACTER_SIZE(substring)),
8408 data_sub,
8409 PyUnicode_GET_LENGTH(substring) *
8410 PyUnicode_CHARACTER_SIZE(substring));
8411 }
8412 /* otherwise we have to compare each character by first accesing it */
8413 else {
8414 /* We do not need to compare 0 and len(substring)-1 because
8415 the if statement above ensured already that they are equal
8416 when we end up here. */
8417 // TODO: honor direction and do a forward or backwards search
8418 for (i = 1; i < end_sub; ++i) {
8419 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8420 PyUnicode_READ(kind_sub, data_sub, i))
8421 return 0;
8422 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008423 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008424 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008425 }
8426
8427 return 0;
8428}
8429
Alexander Belopolsky40018472011-02-26 01:02:56 +00008430Py_ssize_t
8431PyUnicode_Tailmatch(PyObject *str,
8432 PyObject *substr,
8433 Py_ssize_t start,
8434 Py_ssize_t end,
8435 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008436{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008437 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008438
Guido van Rossumd57fd912000-03-10 22:53:23 +00008439 str = PyUnicode_FromObject(str);
8440 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008441 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008442 substr = PyUnicode_FromObject(substr);
8443 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008444 Py_DECREF(str);
8445 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008446 }
Tim Petersced69f82003-09-16 20:30:58 +00008447
Guido van Rossumd57fd912000-03-10 22:53:23 +00008448 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00008449 (PyUnicodeObject *)substr,
8450 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008451 Py_DECREF(str);
8452 Py_DECREF(substr);
8453 return result;
8454}
8455
Guido van Rossumd57fd912000-03-10 22:53:23 +00008456/* Apply fixfct filter to the Unicode object self and return a
8457 reference to the modified object */
8458
Alexander Belopolsky40018472011-02-26 01:02:56 +00008459static PyObject *
8460fixup(PyUnicodeObject *self,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008461 Py_UCS4 (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008462{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008463 PyObject *u;
8464 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008465
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008466 if (PyUnicode_READY(self) == -1)
8467 return NULL;
8468 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8469 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8470 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008471 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008472 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008473
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008474 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
8475 PyUnicode_GET_LENGTH(u) * PyUnicode_CHARACTER_SIZE(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008476
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008477 /* fix functions return the new maximum character in a string,
8478 if the kind of the resulting unicode object does not change,
8479 everything is fine. Otherwise we need to change the string kind
8480 and re-run the fix function. */
8481 maxchar_new = fixfct((PyUnicodeObject*)u);
8482 if (maxchar_new == 0)
8483 /* do nothing, keep maxchar_new at 0 which means no changes. */;
8484 else if (maxchar_new <= 127)
8485 maxchar_new = 127;
8486 else if (maxchar_new <= 255)
8487 maxchar_new = 255;
8488 else if (maxchar_new <= 65535)
8489 maxchar_new = 65535;
8490 else
8491 maxchar_new = 1114111; /* 0x10ffff */
8492
8493 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008494 /* fixfct should return TRUE if it modified the buffer. If
8495 FALSE, return a reference to the original buffer instead
8496 (to save space, not time) */
8497 Py_INCREF(self);
8498 Py_DECREF(u);
8499 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008500 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008501 else if (maxchar_new == maxchar_old) {
8502 return u;
8503 }
8504 else {
8505 /* In case the maximum character changed, we need to
8506 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008507 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008508 if (v == NULL) {
8509 Py_DECREF(u);
8510 return NULL;
8511 }
8512 if (maxchar_new > maxchar_old) {
8513 /* If the maxchar increased so that the kind changed, not all
8514 characters are representable anymore and we need to fix the
8515 string again. This only happens in very few cases. */
Victor Stinner157f83f2011-09-28 21:41:31 +02008516 if (PyUnicode_CopyCharacters(v, 0,
8517 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008518 PyUnicode_GET_LENGTH(self)) < 0)
8519 {
8520 Py_DECREF(u);
8521 return NULL;
8522 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008523 maxchar_old = fixfct((PyUnicodeObject*)v);
8524 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8525 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008526 else {
Victor Stinner157f83f2011-09-28 21:41:31 +02008527 if (PyUnicode_CopyCharacters(v, 0,
8528 u, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008529 PyUnicode_GET_LENGTH(self)) < 0)
8530 {
8531 Py_DECREF(u);
8532 return NULL;
8533 }
8534 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008535
8536 Py_DECREF(u);
8537 return v;
8538 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008539}
8540
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008541static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008542fixupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008543{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008544 /* No need to call PyUnicode_READY(self) because this function is only
8545 called as a callback from fixup() which does it already. */
8546 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8547 const int kind = PyUnicode_KIND(self);
8548 void *data = PyUnicode_DATA(self);
8549 int touched = 0;
8550 Py_UCS4 maxchar = 0;
8551 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008552
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008553 for (i = 0; i < len; ++i) {
8554 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8555 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8556 if (up != ch) {
8557 if (up > maxchar)
8558 maxchar = up;
8559 PyUnicode_WRITE(kind, data, i, up);
8560 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008561 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008562 else if (ch > maxchar)
8563 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008564 }
8565
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008566 if (touched)
8567 return maxchar;
8568 else
8569 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008570}
8571
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008572static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008573fixlower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008574{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008575 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8576 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8577 const int kind = PyUnicode_KIND(self);
8578 void *data = PyUnicode_DATA(self);
8579 int touched = 0;
8580 Py_UCS4 maxchar = 0;
8581 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008582
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008583 for(i = 0; i < len; ++i) {
8584 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8585 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8586 if (lo != ch) {
8587 if (lo > maxchar)
8588 maxchar = lo;
8589 PyUnicode_WRITE(kind, data, i, lo);
8590 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008591 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008592 else if (ch > maxchar)
8593 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008594 }
8595
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008596 if (touched)
8597 return maxchar;
8598 else
8599 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008600}
8601
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008602static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008603fixswapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008604{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008605 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8606 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8607 const int kind = PyUnicode_KIND(self);
8608 void *data = PyUnicode_DATA(self);
8609 int touched = 0;
8610 Py_UCS4 maxchar = 0;
8611 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008612
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008613 for(i = 0; i < len; ++i) {
8614 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8615 Py_UCS4 nu = 0;
8616
8617 if (Py_UNICODE_ISUPPER(ch))
8618 nu = Py_UNICODE_TOLOWER(ch);
8619 else if (Py_UNICODE_ISLOWER(ch))
8620 nu = Py_UNICODE_TOUPPER(ch);
8621
8622 if (nu != 0) {
8623 if (nu > maxchar)
8624 maxchar = nu;
8625 PyUnicode_WRITE(kind, data, i, nu);
8626 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008627 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008628 else if (ch > maxchar)
8629 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008630 }
8631
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008632 if (touched)
8633 return maxchar;
8634 else
8635 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008636}
8637
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008638static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008639fixcapitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008640{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008641 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8642 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8643 const int kind = PyUnicode_KIND(self);
8644 void *data = PyUnicode_DATA(self);
8645 int touched = 0;
8646 Py_UCS4 maxchar = 0;
8647 Py_ssize_t i = 0;
8648 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00008649
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008650 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008651 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008652
8653 ch = PyUnicode_READ(kind, data, i);
8654 if (!Py_UNICODE_ISUPPER(ch)) {
8655 maxchar = Py_UNICODE_TOUPPER(ch);
8656 PyUnicode_WRITE(kind, data, i, maxchar);
8657 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008658 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008659 ++i;
8660 for(; i < len; ++i) {
8661 ch = PyUnicode_READ(kind, data, i);
8662 if (!Py_UNICODE_ISLOWER(ch)) {
8663 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8664 if (lo > maxchar)
8665 maxchar = lo;
8666 PyUnicode_WRITE(kind, data, i, lo);
8667 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008668 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008669 else if (ch > maxchar)
8670 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008671 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008672
8673 if (touched)
8674 return maxchar;
8675 else
8676 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008677}
8678
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008679static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008680fixtitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008681{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008682 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8683 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8684 const int kind = PyUnicode_KIND(self);
8685 void *data = PyUnicode_DATA(self);
8686 Py_UCS4 maxchar = 0;
8687 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008688 int previous_is_cased;
8689
8690 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008691 if (len == 1) {
8692 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8693 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
8694 if (ti != ch) {
8695 PyUnicode_WRITE(kind, data, i, ti);
8696 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00008697 }
8698 else
8699 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008700 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008701 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008702 for(; i < len; ++i) {
8703 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8704 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00008705
Benjamin Peterson29060642009-01-31 22:14:21 +00008706 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008707 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00008708 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008709 nu = Py_UNICODE_TOTITLE(ch);
8710
8711 if (nu > maxchar)
8712 maxchar = nu;
8713 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00008714
Benjamin Peterson29060642009-01-31 22:14:21 +00008715 if (Py_UNICODE_ISLOWER(ch) ||
8716 Py_UNICODE_ISUPPER(ch) ||
8717 Py_UNICODE_ISTITLE(ch))
8718 previous_is_cased = 1;
8719 else
8720 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008721 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008722 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008723}
8724
Tim Peters8ce9f162004-08-27 01:49:32 +00008725PyObject *
8726PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008727{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008728 PyObject *sep = NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008729 Py_ssize_t seplen = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008730 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00008731 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008732 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
8733 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00008734 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008735 Py_ssize_t sz, i, res_offset;
8736 Py_UCS4 maxchar = 0;
8737 Py_UCS4 item_maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008738
Tim Peters05eba1f2004-08-27 21:32:02 +00008739 fseq = PySequence_Fast(seq, "");
8740 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008741 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00008742 }
8743
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008744 /* NOTE: the following code can't call back into Python code,
8745 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00008746 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008747
Tim Peters05eba1f2004-08-27 21:32:02 +00008748 seqlen = PySequence_Fast_GET_SIZE(fseq);
8749 /* If empty sequence, return u"". */
8750 if (seqlen == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008751 res = PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008752 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00008753 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008754 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00008755 /* If singleton sequence with an exact Unicode, return that. */
8756 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008757 item = items[0];
8758 if (PyUnicode_CheckExact(item)) {
8759 Py_INCREF(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008760 res = item;
Benjamin Peterson29060642009-01-31 22:14:21 +00008761 goto Done;
8762 }
Tim Peters8ce9f162004-08-27 01:49:32 +00008763 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008764 else {
8765 /* Set up sep and seplen */
8766 if (separator == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008767 /* fall back to a blank space separator */
8768 sep = PyUnicode_FromOrdinal(' ');
Victor Stinnere9a29352011-10-01 02:14:59 +02008769 if (!sep)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008770 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00008771 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008772 else {
8773 if (!PyUnicode_Check(separator)) {
8774 PyErr_Format(PyExc_TypeError,
8775 "separator: expected str instance,"
8776 " %.80s found",
8777 Py_TYPE(separator)->tp_name);
8778 goto onError;
8779 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008780 if (PyUnicode_READY(separator) == -1)
8781 goto onError;
8782 sep = separator;
8783 seplen = PyUnicode_GET_LENGTH(separator);
8784 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
8785 /* inc refcount to keep this code path symetric with the
8786 above case of a blank separator */
8787 Py_INCREF(sep);
Tim Peters05eba1f2004-08-27 21:32:02 +00008788 }
8789 }
8790
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008791 /* There are at least two things to join, or else we have a subclass
8792 * of str in the sequence.
8793 * Do a pre-pass to figure out the total amount of space we'll
8794 * need (sz), and see whether all argument are strings.
8795 */
8796 sz = 0;
8797 for (i = 0; i < seqlen; i++) {
8798 const Py_ssize_t old_sz = sz;
8799 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00008800 if (!PyUnicode_Check(item)) {
8801 PyErr_Format(PyExc_TypeError,
8802 "sequence item %zd: expected str instance,"
8803 " %.80s found",
8804 i, Py_TYPE(item)->tp_name);
8805 goto onError;
8806 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008807 if (PyUnicode_READY(item) == -1)
8808 goto onError;
8809 sz += PyUnicode_GET_LENGTH(item);
8810 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
8811 if (item_maxchar > maxchar)
8812 maxchar = item_maxchar;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008813 if (i != 0)
8814 sz += seplen;
8815 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
8816 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008817 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008818 goto onError;
8819 }
8820 }
Tim Petersced69f82003-09-16 20:30:58 +00008821
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008822 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008823 if (res == NULL)
8824 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00008825
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008826 /* Catenate everything. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008827 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008828 Py_ssize_t itemlen;
8829 item = items[i];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008830 itemlen = PyUnicode_GET_LENGTH(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008831 /* Copy item, and maybe the separator. */
8832 if (i) {
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008833 if (PyUnicode_CopyCharacters(res, res_offset,
8834 sep, 0, seplen) < 0)
8835 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008836 res_offset += seplen;
Benjamin Peterson29060642009-01-31 22:14:21 +00008837 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008838 if (PyUnicode_CopyCharacters(res, res_offset,
8839 item, 0, itemlen) < 0)
8840 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008841 res_offset += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00008842 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008843 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00008844
Benjamin Peterson29060642009-01-31 22:14:21 +00008845 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00008846 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008847 Py_XDECREF(sep);
8848 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008849
Benjamin Peterson29060642009-01-31 22:14:21 +00008850 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00008851 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008852 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00008853 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008854 return NULL;
8855}
8856
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008857#define FILL(kind, data, value, start, length) \
8858 do { \
8859 Py_ssize_t i_ = 0; \
8860 assert(kind != PyUnicode_WCHAR_KIND); \
8861 switch ((kind)) { \
8862 case PyUnicode_1BYTE_KIND: { \
8863 unsigned char * to_ = (unsigned char *)((data)) + (start); \
8864 memset(to_, (unsigned char)value, length); \
8865 break; \
8866 } \
8867 case PyUnicode_2BYTE_KIND: { \
8868 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
8869 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
8870 break; \
8871 } \
8872 default: { \
8873 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
8874 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
8875 break; \
8876 } \
8877 } \
8878 } while (0)
8879
Alexander Belopolsky40018472011-02-26 01:02:56 +00008880static PyUnicodeObject *
8881pad(PyUnicodeObject *self,
8882 Py_ssize_t left,
8883 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008884 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008885{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008886 PyObject *u;
8887 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008888 int kind;
8889 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008890
8891 if (left < 0)
8892 left = 0;
8893 if (right < 0)
8894 right = 0;
8895
Tim Peters7a29bd52001-09-12 03:03:31 +00008896 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008897 Py_INCREF(self);
8898 return self;
8899 }
8900
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008901 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
8902 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00008903 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
8904 return NULL;
8905 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008906 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
8907 if (fill > maxchar)
8908 maxchar = fill;
8909 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008910 if (!u)
8911 return NULL;
8912
8913 kind = PyUnicode_KIND(u);
8914 data = PyUnicode_DATA(u);
8915 if (left)
8916 FILL(kind, data, fill, 0, left);
8917 if (right)
8918 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinner157f83f2011-09-28 21:41:31 +02008919 if (PyUnicode_CopyCharacters(u, left,
8920 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008921 _PyUnicode_LENGTH(self)) < 0)
8922 {
8923 Py_DECREF(u);
8924 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008925 }
8926
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008927 return (PyUnicodeObject*)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008928}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008929#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00008930
Alexander Belopolsky40018472011-02-26 01:02:56 +00008931PyObject *
8932PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008933{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008934 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008935
8936 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008937 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008938 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008939
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008940 switch(PyUnicode_KIND(string)) {
8941 case PyUnicode_1BYTE_KIND:
8942 list = ucs1lib_splitlines(
8943 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
8944 PyUnicode_GET_LENGTH(string), keepends);
8945 break;
8946 case PyUnicode_2BYTE_KIND:
8947 list = ucs2lib_splitlines(
8948 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
8949 PyUnicode_GET_LENGTH(string), keepends);
8950 break;
8951 case PyUnicode_4BYTE_KIND:
8952 list = ucs4lib_splitlines(
8953 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
8954 PyUnicode_GET_LENGTH(string), keepends);
8955 break;
8956 default:
8957 assert(0);
8958 list = 0;
8959 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008960 Py_DECREF(string);
8961 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008962}
8963
Alexander Belopolsky40018472011-02-26 01:02:56 +00008964static PyObject *
8965split(PyUnicodeObject *self,
8966 PyUnicodeObject *substring,
8967 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008968{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008969 int kind1, kind2, kind;
8970 void *buf1, *buf2;
8971 Py_ssize_t len1, len2;
8972 PyObject* out;
8973
Guido van Rossumd57fd912000-03-10 22:53:23 +00008974 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008975 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008976
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008977 if (PyUnicode_READY(self) == -1)
8978 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008979
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008980 if (substring == NULL)
8981 switch(PyUnicode_KIND(self)) {
8982 case PyUnicode_1BYTE_KIND:
8983 return ucs1lib_split_whitespace(
8984 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
8985 PyUnicode_GET_LENGTH(self), maxcount
8986 );
8987 case PyUnicode_2BYTE_KIND:
8988 return ucs2lib_split_whitespace(
8989 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
8990 PyUnicode_GET_LENGTH(self), maxcount
8991 );
8992 case PyUnicode_4BYTE_KIND:
8993 return ucs4lib_split_whitespace(
8994 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
8995 PyUnicode_GET_LENGTH(self), maxcount
8996 );
8997 default:
8998 assert(0);
8999 return NULL;
9000 }
9001
9002 if (PyUnicode_READY(substring) == -1)
9003 return NULL;
9004
9005 kind1 = PyUnicode_KIND(self);
9006 kind2 = PyUnicode_KIND(substring);
9007 kind = kind1 > kind2 ? kind1 : kind2;
9008 buf1 = PyUnicode_DATA(self);
9009 buf2 = PyUnicode_DATA(substring);
9010 if (kind1 != kind)
9011 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9012 if (!buf1)
9013 return NULL;
9014 if (kind2 != kind)
9015 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9016 if (!buf2) {
9017 if (kind1 != kind) PyMem_Free(buf1);
9018 return NULL;
9019 }
9020 len1 = PyUnicode_GET_LENGTH(self);
9021 len2 = PyUnicode_GET_LENGTH(substring);
9022
9023 switch(kind) {
9024 case PyUnicode_1BYTE_KIND:
9025 out = ucs1lib_split(
9026 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9027 break;
9028 case PyUnicode_2BYTE_KIND:
9029 out = ucs2lib_split(
9030 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9031 break;
9032 case PyUnicode_4BYTE_KIND:
9033 out = ucs4lib_split(
9034 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9035 break;
9036 default:
9037 out = NULL;
9038 }
9039 if (kind1 != kind)
9040 PyMem_Free(buf1);
9041 if (kind2 != kind)
9042 PyMem_Free(buf2);
9043 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009044}
9045
Alexander Belopolsky40018472011-02-26 01:02:56 +00009046static PyObject *
9047rsplit(PyUnicodeObject *self,
9048 PyUnicodeObject *substring,
9049 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009050{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009051 int kind1, kind2, kind;
9052 void *buf1, *buf2;
9053 Py_ssize_t len1, len2;
9054 PyObject* out;
9055
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009056 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009057 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009058
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009059 if (PyUnicode_READY(self) == -1)
9060 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009061
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009062 if (substring == NULL)
9063 switch(PyUnicode_KIND(self)) {
9064 case PyUnicode_1BYTE_KIND:
9065 return ucs1lib_rsplit_whitespace(
9066 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9067 PyUnicode_GET_LENGTH(self), maxcount
9068 );
9069 case PyUnicode_2BYTE_KIND:
9070 return ucs2lib_rsplit_whitespace(
9071 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9072 PyUnicode_GET_LENGTH(self), maxcount
9073 );
9074 case PyUnicode_4BYTE_KIND:
9075 return ucs4lib_rsplit_whitespace(
9076 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9077 PyUnicode_GET_LENGTH(self), maxcount
9078 );
9079 default:
9080 assert(0);
9081 return NULL;
9082 }
9083
9084 if (PyUnicode_READY(substring) == -1)
9085 return NULL;
9086
9087 kind1 = PyUnicode_KIND(self);
9088 kind2 = PyUnicode_KIND(substring);
9089 kind = kind1 > kind2 ? kind1 : kind2;
9090 buf1 = PyUnicode_DATA(self);
9091 buf2 = PyUnicode_DATA(substring);
9092 if (kind1 != kind)
9093 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9094 if (!buf1)
9095 return NULL;
9096 if (kind2 != kind)
9097 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9098 if (!buf2) {
9099 if (kind1 != kind) PyMem_Free(buf1);
9100 return NULL;
9101 }
9102 len1 = PyUnicode_GET_LENGTH(self);
9103 len2 = PyUnicode_GET_LENGTH(substring);
9104
9105 switch(kind) {
9106 case PyUnicode_1BYTE_KIND:
9107 out = ucs1lib_rsplit(
9108 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9109 break;
9110 case PyUnicode_2BYTE_KIND:
9111 out = ucs2lib_rsplit(
9112 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9113 break;
9114 case PyUnicode_4BYTE_KIND:
9115 out = ucs4lib_rsplit(
9116 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9117 break;
9118 default:
9119 out = NULL;
9120 }
9121 if (kind1 != kind)
9122 PyMem_Free(buf1);
9123 if (kind2 != kind)
9124 PyMem_Free(buf2);
9125 return out;
9126}
9127
9128static Py_ssize_t
9129anylib_find(int kind, void *buf1, Py_ssize_t len1,
9130 void *buf2, Py_ssize_t len2, Py_ssize_t offset)
9131{
9132 switch(kind) {
9133 case PyUnicode_1BYTE_KIND:
9134 return ucs1lib_find(buf1, len1, buf2, len2, offset);
9135 case PyUnicode_2BYTE_KIND:
9136 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9137 case PyUnicode_4BYTE_KIND:
9138 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9139 }
9140 assert(0);
9141 return -1;
9142}
9143
9144static Py_ssize_t
9145anylib_count(int kind, void* sbuf, Py_ssize_t slen,
9146 void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
9147{
9148 switch(kind) {
9149 case PyUnicode_1BYTE_KIND:
9150 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
9151 case PyUnicode_2BYTE_KIND:
9152 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9153 case PyUnicode_4BYTE_KIND:
9154 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9155 }
9156 assert(0);
9157 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009158}
9159
Alexander Belopolsky40018472011-02-26 01:02:56 +00009160static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009161replace(PyObject *self, PyObject *str1,
9162 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009163{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009164 PyObject *u;
9165 char *sbuf = PyUnicode_DATA(self);
9166 char *buf1 = PyUnicode_DATA(str1);
9167 char *buf2 = PyUnicode_DATA(str2);
9168 int srelease = 0, release1 = 0, release2 = 0;
9169 int skind = PyUnicode_KIND(self);
9170 int kind1 = PyUnicode_KIND(str1);
9171 int kind2 = PyUnicode_KIND(str2);
9172 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9173 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9174 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009175
9176 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009177 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009178 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009179 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009180
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009181 if (skind < kind1)
9182 /* substring too wide to be present */
9183 goto nothing;
9184
9185 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00009186 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009187 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009188 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009189 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009190 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009191 /* replace characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009192 Py_UCS4 u1, u2, maxchar;
9193 int mayshrink, rkind;
9194 u1 = PyUnicode_READ_CHAR(str1, 0);
9195 if (!findchar(sbuf, PyUnicode_KIND(self),
9196 slen, u1, 1))
Thomas Wouters477c8d52006-05-27 19:21:47 +00009197 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009198 u2 = PyUnicode_READ_CHAR(str2, 0);
9199 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9200 /* Replacing u1 with u2 may cause a maxchar reduction in the
9201 result string. */
9202 mayshrink = maxchar > 127;
9203 if (u2 > maxchar) {
9204 maxchar = u2;
9205 mayshrink = 0;
9206 }
9207 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009208 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009209 goto error;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009210 if (PyUnicode_CopyCharacters(u, 0,
9211 (PyObject*)self, 0, slen) < 0)
9212 {
9213 Py_DECREF(u);
9214 return NULL;
9215 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009216 rkind = PyUnicode_KIND(u);
9217 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
9218 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009219 if (--maxcount < 0)
9220 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009221 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009222 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009223 if (mayshrink) {
9224 PyObject *tmp = u;
9225 u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp),
9226 PyUnicode_GET_LENGTH(tmp));
9227 Py_DECREF(tmp);
9228 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009229 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009230 int rkind = skind;
9231 char *res;
9232 if (kind1 < rkind) {
9233 /* widen substring */
9234 buf1 = _PyUnicode_AsKind(str1, rkind);
9235 if (!buf1) goto error;
9236 release1 = 1;
9237 }
9238 i = anylib_find(rkind, sbuf, slen, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009239 if (i < 0)
9240 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009241 if (rkind > kind2) {
9242 /* widen replacement */
9243 buf2 = _PyUnicode_AsKind(str2, rkind);
9244 if (!buf2) goto error;
9245 release2 = 1;
9246 }
9247 else if (rkind < kind2) {
9248 /* widen self and buf1 */
9249 rkind = kind2;
9250 if (release1) PyMem_Free(buf1);
9251 sbuf = _PyUnicode_AsKind(self, rkind);
9252 if (!sbuf) goto error;
9253 srelease = 1;
9254 buf1 = _PyUnicode_AsKind(str1, rkind);
9255 if (!buf1) goto error;
9256 release1 = 1;
9257 }
9258 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen));
9259 if (!res) {
9260 PyErr_NoMemory();
9261 goto error;
9262 }
9263 memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen));
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009264 /* change everything in-place, starting with this one */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009265 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9266 buf2,
9267 PyUnicode_KIND_SIZE(rkind, len2));
9268 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009269
9270 while ( --maxcount > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009271 i = anylib_find(rkind, sbuf+PyUnicode_KIND_SIZE(rkind, i),
9272 slen-i,
9273 buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009274 if (i == -1)
9275 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009276 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9277 buf2,
9278 PyUnicode_KIND_SIZE(rkind, len2));
9279 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009280 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009281
9282 u = PyUnicode_FromKindAndData(rkind, res, slen);
9283 PyMem_Free(res);
9284 if (!u) goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009285 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009286 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009287
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009288 Py_ssize_t n, i, j, ires;
9289 Py_ssize_t product, new_size;
9290 int rkind = skind;
9291 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009292
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009293 if (kind1 < rkind) {
9294 buf1 = _PyUnicode_AsKind(str1, rkind);
9295 if (!buf1) goto error;
9296 release1 = 1;
9297 }
9298 n = anylib_count(rkind, sbuf, slen, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009299 if (n == 0)
9300 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009301 if (kind2 < rkind) {
9302 buf2 = _PyUnicode_AsKind(str2, rkind);
9303 if (!buf2) goto error;
9304 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009305 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009306 else if (kind2 > rkind) {
9307 rkind = kind2;
9308 sbuf = _PyUnicode_AsKind(self, rkind);
9309 if (!sbuf) goto error;
9310 srelease = 1;
9311 if (release1) PyMem_Free(buf1);
9312 buf1 = _PyUnicode_AsKind(str1, rkind);
9313 if (!buf1) goto error;
9314 release1 = 1;
9315 }
9316 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
9317 PyUnicode_GET_LENGTH(str1))); */
9318 product = n * (len2-len1);
9319 if ((product / (len2-len1)) != n) {
9320 PyErr_SetString(PyExc_OverflowError,
9321 "replace string is too long");
9322 goto error;
9323 }
9324 new_size = slen + product;
9325 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
9326 PyErr_SetString(PyExc_OverflowError,
9327 "replace string is too long");
9328 goto error;
9329 }
9330 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size));
9331 if (!res)
9332 goto error;
9333 ires = i = 0;
9334 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009335 while (n-- > 0) {
9336 /* look for next match */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009337 j = anylib_find(rkind,
9338 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9339 slen-i, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009340 if (j == -1)
9341 break;
9342 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009343 /* copy unchanged part [i:j] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009344 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9345 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9346 PyUnicode_KIND_SIZE(rkind, j-i));
9347 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009348 }
9349 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009350 if (len2 > 0) {
9351 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9352 buf2,
9353 PyUnicode_KIND_SIZE(rkind, len2));
9354 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009355 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009356 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009357 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009358 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009359 /* copy tail [i:] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009360 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9361 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9362 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009363 } else {
9364 /* interleave */
9365 while (n > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009366 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9367 buf2,
9368 PyUnicode_KIND_SIZE(rkind, len2));
9369 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009370 if (--n <= 0)
9371 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009372 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9373 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9374 PyUnicode_KIND_SIZE(rkind, 1));
9375 ires++;
9376 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009377 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009378 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9379 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9380 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009381 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009382 u = PyUnicode_FromKindAndData(rkind, res, new_size);
Martin v. Löwis0b1d3482011-10-01 16:35:40 +02009383 PyMem_Free(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009384 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009385 if (srelease)
9386 PyMem_FREE(sbuf);
9387 if (release1)
9388 PyMem_FREE(buf1);
9389 if (release2)
9390 PyMem_FREE(buf2);
9391 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009392
Benjamin Peterson29060642009-01-31 22:14:21 +00009393 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00009394 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009395 if (srelease)
9396 PyMem_FREE(sbuf);
9397 if (release1)
9398 PyMem_FREE(buf1);
9399 if (release2)
9400 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009401 if (PyUnicode_CheckExact(self)) {
9402 Py_INCREF(self);
9403 return (PyObject *) self;
9404 }
Victor Stinner034f6cf2011-09-30 02:26:44 +02009405 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009406 error:
9407 if (srelease && sbuf)
9408 PyMem_FREE(sbuf);
9409 if (release1 && buf1)
9410 PyMem_FREE(buf1);
9411 if (release2 && buf2)
9412 PyMem_FREE(buf2);
9413 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009414}
9415
9416/* --- Unicode Object Methods --------------------------------------------- */
9417
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009418PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009419 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009420\n\
9421Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009422characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009423
9424static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009425unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009426{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009427 return fixup(self, fixtitle);
9428}
9429
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009430PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009431 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009432\n\
9433Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00009434have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009435
9436static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009437unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009438{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009439 return fixup(self, fixcapitalize);
9440}
9441
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled by the surrounding #if 0.  Splits self on whitespace,
   capitalizes every word in place in the resulting list and joins the
   words again.  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined = NULL;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list slot */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *word = PyList_GET_ITEM(words, idx);
        PyObject *capitalized = fixup((PyUnicodeObject *)word,
                                      fixcapitalize);
        if (capitalized == NULL)
            goto done;
        Py_DECREF(word);
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);

  done:
    Py_DECREF(words);
    return joined;
}
#endif
9479
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009480/* Argument converter. Coerces to a single unicode character */
9481
9482static int
9483convert_uc(PyObject *obj, void *addr)
9484{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009485 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009486 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009487
Benjamin Peterson14339b62009-01-31 16:36:08 +00009488 uniobj = PyUnicode_FromObject(obj);
9489 if (uniobj == NULL) {
9490 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009491 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009492 return 0;
9493 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009494 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009495 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009496 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009497 Py_DECREF(uniobj);
9498 return 0;
9499 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009500 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009501 Py_DECREF(uniobj);
9502 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009503}
9504
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009505PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009506 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009507\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009508Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009509done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009510
9511static PyObject *
9512unicode_center(PyUnicodeObject *self, PyObject *args)
9513{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009514 Py_ssize_t marg, left;
9515 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009516 Py_UCS4 fillchar = ' ';
9517
Victor Stinnere9a29352011-10-01 02:14:59 +02009518 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009519 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009520
Victor Stinnere9a29352011-10-01 02:14:59 +02009521 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009522 return NULL;
9523
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009524 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009525 Py_INCREF(self);
9526 return (PyObject*) self;
9527 }
9528
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009529 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009530 left = marg / 2 + (marg & width & 1);
9531
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009532 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009533}
9534
Marc-André Lemburge5034372000-08-08 08:04:29 +00009535#if 0
9536
9537/* This code should go into some future Unicode collation support
9538 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00009539 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00009540
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009541/* speedy UTF-16 code point order comparison */
9542/* gleaned from: */
9543/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
9544
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009545static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009546{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009547 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00009548 0, 0, 0, 0, 0, 0, 0, 0,
9549 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009550 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009551};
9552
Guido van Rossumd57fd912000-03-10 22:53:23 +00009553static int
9554unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9555{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009556 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009557
Guido van Rossumd57fd912000-03-10 22:53:23 +00009558 Py_UNICODE *s1 = str1->str;
9559 Py_UNICODE *s2 = str2->str;
9560
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009561 len1 = str1->_base._base.length;
9562 len2 = str2->_base._base.length;
Tim Petersced69f82003-09-16 20:30:58 +00009563
Guido van Rossumd57fd912000-03-10 22:53:23 +00009564 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00009565 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009566
9567 c1 = *s1++;
9568 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00009569
Benjamin Peterson29060642009-01-31 22:14:21 +00009570 if (c1 > (1<<11) * 26)
9571 c1 += utf16Fixup[c1>>11];
9572 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009573 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009574 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00009575
9576 if (c1 != c2)
9577 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00009578
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009579 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009580 }
9581
9582 return (len1 < len2) ? -1 : (len1 != len2);
9583}
9584
Marc-André Lemburge5034372000-08-08 08:04:29 +00009585#else
9586
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009587/* This function assumes that str1 and str2 are readied by the caller. */
9588
Marc-André Lemburge5034372000-08-08 08:04:29 +00009589static int
9590unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9591{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009592 int kind1, kind2;
9593 void *data1, *data2;
9594 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009595
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009596 kind1 = PyUnicode_KIND(str1);
9597 kind2 = PyUnicode_KIND(str2);
9598 data1 = PyUnicode_DATA(str1);
9599 data2 = PyUnicode_DATA(str2);
9600 len1 = PyUnicode_GET_LENGTH(str1);
9601 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +00009602
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009603 for (i = 0; i < len1 && i < len2; ++i) {
9604 Py_UCS4 c1, c2;
9605 c1 = PyUnicode_READ(kind1, data1, i);
9606 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +00009607
9608 if (c1 != c2)
9609 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009610 }
9611
9612 return (len1 < len2) ? -1 : (len1 != len2);
9613}
9614
9615#endif
9616
Alexander Belopolsky40018472011-02-26 01:02:56 +00009617int
9618PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009619{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009620 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9621 if (PyUnicode_READY(left) == -1 ||
9622 PyUnicode_READY(right) == -1)
9623 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009624 return unicode_compare((PyUnicodeObject *)left,
9625 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009626 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009627 PyErr_Format(PyExc_TypeError,
9628 "Can't compare %.100s and %.100s",
9629 left->ob_type->tp_name,
9630 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009631 return -1;
9632}
9633
Martin v. Löwis5b222132007-06-10 09:51:05 +00009634int
9635PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
9636{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009637 Py_ssize_t i;
9638 int kind;
9639 void *data;
9640 Py_UCS4 chr;
9641
Victor Stinner910337b2011-10-03 03:20:16 +02009642 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009643 if (PyUnicode_READY(uni) == -1)
9644 return -1;
9645 kind = PyUnicode_KIND(uni);
9646 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009647 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009648 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
9649 if (chr != str[i])
9650 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00009651 /* This check keeps Python strings that end in '\0' from comparing equal
9652 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009653 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +00009654 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009655 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00009656 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009657 return 0;
9658}
9659
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009660
Benjamin Peterson29060642009-01-31 22:14:21 +00009661#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00009662 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009663
Alexander Belopolsky40018472011-02-26 01:02:56 +00009664PyObject *
9665PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009666{
9667 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009668
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009669 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9670 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009671 if (PyUnicode_READY(left) == -1 ||
9672 PyUnicode_READY(right) == -1)
9673 return NULL;
9674 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
9675 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009676 if (op == Py_EQ) {
9677 Py_INCREF(Py_False);
9678 return Py_False;
9679 }
9680 if (op == Py_NE) {
9681 Py_INCREF(Py_True);
9682 return Py_True;
9683 }
9684 }
9685 if (left == right)
9686 result = 0;
9687 else
9688 result = unicode_compare((PyUnicodeObject *)left,
9689 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009690
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009691 /* Convert the return value to a Boolean */
9692 switch (op) {
9693 case Py_EQ:
9694 v = TEST_COND(result == 0);
9695 break;
9696 case Py_NE:
9697 v = TEST_COND(result != 0);
9698 break;
9699 case Py_LE:
9700 v = TEST_COND(result <= 0);
9701 break;
9702 case Py_GE:
9703 v = TEST_COND(result >= 0);
9704 break;
9705 case Py_LT:
9706 v = TEST_COND(result == -1);
9707 break;
9708 case Py_GT:
9709 v = TEST_COND(result == 1);
9710 break;
9711 default:
9712 PyErr_BadArgument();
9713 return NULL;
9714 }
9715 Py_INCREF(v);
9716 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009717 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009718
Brian Curtindfc80e32011-08-10 20:28:54 -05009719 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009720}
9721
Alexander Belopolsky40018472011-02-26 01:02:56 +00009722int
9723PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00009724{
Thomas Wouters477c8d52006-05-27 19:21:47 +00009725 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009726 int kind1, kind2, kind;
9727 void *buf1, *buf2;
9728 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009729 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009730
9731 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00009732 sub = PyUnicode_FromObject(element);
9733 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009734 PyErr_Format(PyExc_TypeError,
9735 "'in <string>' requires string as left operand, not %s",
9736 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009737 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009738 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009739 if (PyUnicode_READY(sub) == -1)
9740 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009741
Thomas Wouters477c8d52006-05-27 19:21:47 +00009742 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +02009743 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009744 Py_DECREF(sub);
9745 return -1;
9746 }
9747
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009748 kind1 = PyUnicode_KIND(str);
9749 kind2 = PyUnicode_KIND(sub);
9750 kind = kind1 > kind2 ? kind1 : kind2;
9751 buf1 = PyUnicode_DATA(str);
9752 buf2 = PyUnicode_DATA(sub);
9753 if (kind1 != kind)
9754 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
9755 if (!buf1) {
9756 Py_DECREF(sub);
9757 return -1;
9758 }
9759 if (kind2 != kind)
9760 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
9761 if (!buf2) {
9762 Py_DECREF(sub);
9763 if (kind1 != kind) PyMem_Free(buf1);
9764 return -1;
9765 }
9766 len1 = PyUnicode_GET_LENGTH(str);
9767 len2 = PyUnicode_GET_LENGTH(sub);
9768
9769 switch(kind) {
9770 case PyUnicode_1BYTE_KIND:
9771 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
9772 break;
9773 case PyUnicode_2BYTE_KIND:
9774 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
9775 break;
9776 case PyUnicode_4BYTE_KIND:
9777 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
9778 break;
9779 default:
9780 result = -1;
9781 assert(0);
9782 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009783
9784 Py_DECREF(str);
9785 Py_DECREF(sub);
9786
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009787 if (kind1 != kind)
9788 PyMem_Free(buf1);
9789 if (kind2 != kind)
9790 PyMem_Free(buf2);
9791
Guido van Rossum403d68b2000-03-13 15:55:09 +00009792 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009793}
9794
Guido van Rossumd57fd912000-03-10 22:53:23 +00009795/* Concat to string or Unicode object giving a new Unicode object. */
9796
Alexander Belopolsky40018472011-02-26 01:02:56 +00009797PyObject *
9798PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009799{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009800 PyObject *u = NULL, *v = NULL, *w;
9801 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009802
9803 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009804 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009805 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009806 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009807 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009808 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009809 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009810
9811 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +02009812 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009813 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009814 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009815 }
Victor Stinnera464fc12011-10-02 20:39:30 +02009816 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009817 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009818 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009819 }
9820
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009821 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinnerff9e50f2011-09-28 22:17:19 +02009822 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009823
Guido van Rossumd57fd912000-03-10 22:53:23 +00009824 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009825 w = PyUnicode_New(
9826 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
9827 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009828 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009829 goto onError;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009830 if (PyUnicode_CopyCharacters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)) < 0)
9831 goto onError;
Victor Stinner157f83f2011-09-28 21:41:31 +02009832 if (PyUnicode_CopyCharacters(w, PyUnicode_GET_LENGTH(u),
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009833 v, 0,
9834 PyUnicode_GET_LENGTH(v)) < 0)
9835 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009836 Py_DECREF(u);
9837 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009838 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009839
Benjamin Peterson29060642009-01-31 22:14:21 +00009840 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009841 Py_XDECREF(u);
9842 Py_XDECREF(v);
9843 return NULL;
9844}
9845
Walter Dörwald1ab83302007-05-18 17:15:44 +00009846void
Victor Stinner23e56682011-10-03 03:54:37 +02009847PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +00009848{
Victor Stinner23e56682011-10-03 03:54:37 +02009849 PyObject *left, *res;
9850
9851 if (p_left == NULL) {
9852 if (!PyErr_Occurred())
9853 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +00009854 return;
9855 }
Victor Stinner23e56682011-10-03 03:54:37 +02009856 left = *p_left;
9857 if (right == NULL || !PyUnicode_Check(left)) {
9858 if (!PyErr_Occurred())
9859 PyErr_BadInternalCall();
9860 goto error;
9861 }
9862
9863 if (PyUnicode_CheckExact(left) && left != unicode_empty
9864 && PyUnicode_CheckExact(right) && right != unicode_empty
9865 && unicode_resizable(left)
9866 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
9867 || _PyUnicode_WSTR(left) != NULL))
9868 {
9869 Py_ssize_t u_len, v_len, new_len, copied;
9870
9871 /* FIXME: don't make wstr string ready */
9872 if (PyUnicode_READY(left))
9873 goto error;
9874 if (PyUnicode_READY(right))
9875 goto error;
9876
9877 /* FIXME: support ascii+latin1, PyASCIIObject => PyCompactUnicodeObject */
9878 if (PyUnicode_MAX_CHAR_VALUE(right) <= PyUnicode_MAX_CHAR_VALUE(left))
9879 {
9880 u_len = PyUnicode_GET_LENGTH(left);
9881 v_len = PyUnicode_GET_LENGTH(right);
9882 if (u_len > PY_SSIZE_T_MAX - v_len) {
9883 PyErr_SetString(PyExc_OverflowError,
9884 "strings are too large to concat");
9885 goto error;
9886 }
9887 new_len = u_len + v_len;
9888
9889 /* Now we own the last reference to 'left', so we can resize it
9890 * in-place.
9891 */
9892 if (unicode_resize(&left, new_len) != 0) {
9893 /* XXX if _PyUnicode_Resize() fails, 'left' has been
9894 * deallocated so it cannot be put back into
9895 * 'variable'. The MemoryError is raised when there
9896 * is no value in 'variable', which might (very
9897 * remotely) be a cause of incompatibilities.
9898 */
9899 goto error;
9900 }
9901 /* copy 'right' into the newly allocated area of 'left' */
9902 copied = PyUnicode_CopyCharacters(left, u_len,
9903 right, 0,
9904 v_len);
9905 assert(0 <= copied);
9906 *p_left = left;
9907 return;
9908 }
9909 }
9910
9911 res = PyUnicode_Concat(left, right);
9912 if (res == NULL)
9913 goto error;
9914 Py_DECREF(left);
9915 *p_left = res;
9916 return;
9917
9918error:
9919 Py_DECREF(*p_left);
9920 *p_left = NULL;
Walter Dörwald1ab83302007-05-18 17:15:44 +00009921}
9922
9923void
9924PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
9925{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009926 PyUnicode_Append(pleft, right);
9927 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00009928}
9929
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009930PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009931 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009932\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00009933Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009934string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009935interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009936
9937static PyObject *
9938unicode_count(PyUnicodeObject *self, PyObject *args)
9939{
9940 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009941 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009942 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009943 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009944 int kind1, kind2, kind;
9945 void *buf1, *buf2;
9946 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009947
Jesus Ceaac451502011-04-20 17:09:23 +02009948 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
9949 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +00009950 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00009951
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009952 kind1 = PyUnicode_KIND(self);
9953 kind2 = PyUnicode_KIND(substring);
9954 kind = kind1 > kind2 ? kind1 : kind2;
9955 buf1 = PyUnicode_DATA(self);
9956 buf2 = PyUnicode_DATA(substring);
9957 if (kind1 != kind)
9958 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9959 if (!buf1) {
9960 Py_DECREF(substring);
9961 return NULL;
9962 }
9963 if (kind2 != kind)
9964 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9965 if (!buf2) {
9966 Py_DECREF(substring);
9967 if (kind1 != kind) PyMem_Free(buf1);
9968 return NULL;
9969 }
9970 len1 = PyUnicode_GET_LENGTH(self);
9971 len2 = PyUnicode_GET_LENGTH(substring);
9972
9973 ADJUST_INDICES(start, end, len1);
9974 switch(kind) {
9975 case PyUnicode_1BYTE_KIND:
9976 iresult = ucs1lib_count(
9977 ((Py_UCS1*)buf1) + start, end - start,
9978 buf2, len2, PY_SSIZE_T_MAX
9979 );
9980 break;
9981 case PyUnicode_2BYTE_KIND:
9982 iresult = ucs2lib_count(
9983 ((Py_UCS2*)buf1) + start, end - start,
9984 buf2, len2, PY_SSIZE_T_MAX
9985 );
9986 break;
9987 case PyUnicode_4BYTE_KIND:
9988 iresult = ucs4lib_count(
9989 ((Py_UCS4*)buf1) + start, end - start,
9990 buf2, len2, PY_SSIZE_T_MAX
9991 );
9992 break;
9993 default:
9994 assert(0); iresult = 0;
9995 }
9996
9997 result = PyLong_FromSsize_t(iresult);
9998
9999 if (kind1 != kind)
10000 PyMem_Free(buf1);
10001 if (kind2 != kind)
10002 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010003
10004 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010005
Guido van Rossumd57fd912000-03-10 22:53:23 +000010006 return result;
10007}
10008
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010009PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010010 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010011\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010012Encode S using the codec registered for encoding. Default encoding\n\
10013is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010014handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010015a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10016'xmlcharrefreplace' as well as any other name registered with\n\
10017codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010018
10019static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +000010020unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010021{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010022 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010023 char *encoding = NULL;
10024 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010025
Benjamin Peterson308d6372009-09-18 21:42:35 +000010026 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10027 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010028 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +000010029 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010030}
10031
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010032PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010033 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010034\n\
10035Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010036If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010037
10038static PyObject*
10039unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
10040{
10041 Py_UNICODE *e;
10042 Py_UNICODE *p;
10043 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010044 Py_UNICODE *qe;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010045 Py_ssize_t i, j, incr, wstr_length;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010046 PyUnicodeObject *u;
10047 int tabsize = 8;
10048
10049 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010050 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010051
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010052 if (PyUnicode_AsUnicodeAndSize((PyObject *)self, &wstr_length) == NULL)
10053 return NULL;
10054
Thomas Wouters7e474022000-07-16 12:04:32 +000010055 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010056 i = 0; /* chars up to and including most recent \n or \r */
10057 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010058 e = _PyUnicode_WSTR(self) + wstr_length; /* end of input */
10059 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010060 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010061 if (tabsize > 0) {
10062 incr = tabsize - (j % tabsize); /* cannot overflow */
10063 if (j > PY_SSIZE_T_MAX - incr)
10064 goto overflow1;
10065 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010066 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010067 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010068 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010069 if (j > PY_SSIZE_T_MAX - 1)
10070 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010071 j++;
10072 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010073 if (i > PY_SSIZE_T_MAX - j)
10074 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010075 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010076 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010077 }
10078 }
10079
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010080 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +000010081 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010082
Guido van Rossumd57fd912000-03-10 22:53:23 +000010083 /* Second pass: create output string and fill it */
10084 u = _PyUnicode_New(i + j);
10085 if (!u)
10086 return NULL;
10087
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010088 j = 0; /* same as in first pass */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010089 q = _PyUnicode_WSTR(u); /* next output char */
10090 qe = _PyUnicode_WSTR(u) + PyUnicode_GET_SIZE(u); /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +000010091
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010092 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010093 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010094 if (tabsize > 0) {
10095 i = tabsize - (j % tabsize);
10096 j += i;
10097 while (i--) {
10098 if (q >= qe)
10099 goto overflow2;
10100 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010101 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010102 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010103 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010104 else {
10105 if (q >= qe)
10106 goto overflow2;
10107 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010108 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010109 if (*p == '\n' || *p == '\r')
10110 j = 0;
10111 }
10112
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010113 if (PyUnicode_READY(u) == -1) {
10114 Py_DECREF(u);
10115 return NULL;
10116 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010117 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010118
10119 overflow2:
10120 Py_DECREF(u);
10121 overflow1:
10122 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10123 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010124}
10125
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010126PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010127 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010128\n\
10129Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010130such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010131arguments start and end are interpreted as in slice notation.\n\
10132\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010133Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010134
10135static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010136unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010137{
Jesus Ceaac451502011-04-20 17:09:23 +020010138 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010139 Py_ssize_t start;
10140 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010141 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010142
Jesus Ceaac451502011-04-20 17:09:23 +020010143 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10144 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010145 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010146
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010147 if (PyUnicode_READY(self) == -1)
10148 return NULL;
10149 if (PyUnicode_READY(substring) == -1)
10150 return NULL;
10151
10152 result = any_find_slice(
10153 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
10154 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010155 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010156
10157 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010158
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010159 if (result == -2)
10160 return NULL;
10161
Christian Heimes217cfd12007-12-02 14:31:20 +000010162 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010163}
10164
10165static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010166unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010167{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010168 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
10169 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010170 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010171 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010172}
10173
Guido van Rossumc2504932007-09-18 19:42:40 +000010174/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010010175 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000010176static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +000010177unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010178{
Guido van Rossumc2504932007-09-18 19:42:40 +000010179 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010010180 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010181
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010182 if (_PyUnicode_HASH(self) != -1)
10183 return _PyUnicode_HASH(self);
10184 if (PyUnicode_READY(self) == -1)
10185 return -1;
10186 len = PyUnicode_GET_LENGTH(self);
10187
10188 /* The hash function as a macro, gets expanded three times below. */
10189#define HASH(P) \
10190 x = (Py_uhash_t)*P << 7; \
10191 while (--len >= 0) \
10192 x = (1000003*x) ^ (Py_uhash_t)*P++;
10193
10194 switch (PyUnicode_KIND(self)) {
10195 case PyUnicode_1BYTE_KIND: {
10196 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
10197 HASH(c);
10198 break;
10199 }
10200 case PyUnicode_2BYTE_KIND: {
10201 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
10202 HASH(s);
10203 break;
10204 }
10205 default: {
10206 Py_UCS4 *l;
10207 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
10208 "Impossible switch case in unicode_hash");
10209 l = PyUnicode_4BYTE_DATA(self);
10210 HASH(l);
10211 break;
10212 }
10213 }
10214 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
10215
Guido van Rossumc2504932007-09-18 19:42:40 +000010216 if (x == -1)
10217 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010218 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010219 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010220}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010221#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000010222
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010223PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010224 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010225\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010226Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010227
10228static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010229unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010230{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010231 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +020010232 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010233 Py_ssize_t start;
10234 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010235
Jesus Ceaac451502011-04-20 17:09:23 +020010236 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
10237 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010238 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010239
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010240 if (PyUnicode_READY(self) == -1)
10241 return NULL;
10242 if (PyUnicode_READY(substring) == -1)
10243 return NULL;
10244
10245 result = any_find_slice(
10246 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
10247 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010248 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010249
10250 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010251
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010252 if (result == -2)
10253 return NULL;
10254
Guido van Rossumd57fd912000-03-10 22:53:23 +000010255 if (result < 0) {
10256 PyErr_SetString(PyExc_ValueError, "substring not found");
10257 return NULL;
10258 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010259
Christian Heimes217cfd12007-12-02 14:31:20 +000010260 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010261}
10262
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010263PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010264 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010265\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010266Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010267at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010268
10269static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010270unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010271{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010272 Py_ssize_t i, length;
10273 int kind;
10274 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010275 int cased;
10276
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010277 if (PyUnicode_READY(self) == -1)
10278 return NULL;
10279 length = PyUnicode_GET_LENGTH(self);
10280 kind = PyUnicode_KIND(self);
10281 data = PyUnicode_DATA(self);
10282
Guido van Rossumd57fd912000-03-10 22:53:23 +000010283 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010284 if (length == 1)
10285 return PyBool_FromLong(
10286 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010287
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010288 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010289 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010290 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010291
Guido van Rossumd57fd912000-03-10 22:53:23 +000010292 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010293 for (i = 0; i < length; i++) {
10294 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010295
Benjamin Peterson29060642009-01-31 22:14:21 +000010296 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
10297 return PyBool_FromLong(0);
10298 else if (!cased && Py_UNICODE_ISLOWER(ch))
10299 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010300 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010301 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010302}
10303
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010304PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010305 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010306\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010307Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010308at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010309
10310static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010311unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010312{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010313 Py_ssize_t i, length;
10314 int kind;
10315 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010316 int cased;
10317
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010318 if (PyUnicode_READY(self) == -1)
10319 return NULL;
10320 length = PyUnicode_GET_LENGTH(self);
10321 kind = PyUnicode_KIND(self);
10322 data = PyUnicode_DATA(self);
10323
Guido van Rossumd57fd912000-03-10 22:53:23 +000010324 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010325 if (length == 1)
10326 return PyBool_FromLong(
10327 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010328
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010329 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010330 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010331 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010332
Guido van Rossumd57fd912000-03-10 22:53:23 +000010333 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010334 for (i = 0; i < length; i++) {
10335 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010336
Benjamin Peterson29060642009-01-31 22:14:21 +000010337 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
10338 return PyBool_FromLong(0);
10339 else if (!cased && Py_UNICODE_ISUPPER(ch))
10340 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010341 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010342 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010343}
10344
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010345PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010346 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010347\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010348Return True if S is a titlecased string and there is at least one\n\
10349character in S, i.e. upper- and titlecase characters may only\n\
10350follow uncased characters and lowercase characters only cased ones.\n\
10351Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010352
10353static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010354unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010355{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010356 Py_ssize_t i, length;
10357 int kind;
10358 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010359 int cased, previous_is_cased;
10360
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010361 if (PyUnicode_READY(self) == -1)
10362 return NULL;
10363 length = PyUnicode_GET_LENGTH(self);
10364 kind = PyUnicode_KIND(self);
10365 data = PyUnicode_DATA(self);
10366
Guido van Rossumd57fd912000-03-10 22:53:23 +000010367 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010368 if (length == 1) {
10369 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10370 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
10371 (Py_UNICODE_ISUPPER(ch) != 0));
10372 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010373
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010374 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010375 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010376 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010377
Guido van Rossumd57fd912000-03-10 22:53:23 +000010378 cased = 0;
10379 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010380 for (i = 0; i < length; i++) {
10381 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010382
Benjamin Peterson29060642009-01-31 22:14:21 +000010383 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
10384 if (previous_is_cased)
10385 return PyBool_FromLong(0);
10386 previous_is_cased = 1;
10387 cased = 1;
10388 }
10389 else if (Py_UNICODE_ISLOWER(ch)) {
10390 if (!previous_is_cased)
10391 return PyBool_FromLong(0);
10392 previous_is_cased = 1;
10393 cased = 1;
10394 }
10395 else
10396 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010397 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010398 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010399}
10400
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010401PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010402 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010403\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010404Return True if all characters in S are whitespace\n\
10405and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010406
10407static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010408unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010409{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010410 Py_ssize_t i, length;
10411 int kind;
10412 void *data;
10413
10414 if (PyUnicode_READY(self) == -1)
10415 return NULL;
10416 length = PyUnicode_GET_LENGTH(self);
10417 kind = PyUnicode_KIND(self);
10418 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010419
Guido van Rossumd57fd912000-03-10 22:53:23 +000010420 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010421 if (length == 1)
10422 return PyBool_FromLong(
10423 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010424
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010425 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010426 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010427 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010428
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010429 for (i = 0; i < length; i++) {
10430 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010431 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010432 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010433 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010434 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010435}
10436
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010437PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010438 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010439\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010440Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010441and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010442
10443static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010444unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010445{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010446 Py_ssize_t i, length;
10447 int kind;
10448 void *data;
10449
10450 if (PyUnicode_READY(self) == -1)
10451 return NULL;
10452 length = PyUnicode_GET_LENGTH(self);
10453 kind = PyUnicode_KIND(self);
10454 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010455
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010456 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010457 if (length == 1)
10458 return PyBool_FromLong(
10459 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010460
10461 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010462 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010463 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010464
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010465 for (i = 0; i < length; i++) {
10466 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010467 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010468 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010469 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010470}
10471
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010472PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010473 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010474\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010475Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010476and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010477
10478static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010479unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010480{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010481 int kind;
10482 void *data;
10483 Py_ssize_t len, i;
10484
10485 if (PyUnicode_READY(self) == -1)
10486 return NULL;
10487
10488 kind = PyUnicode_KIND(self);
10489 data = PyUnicode_DATA(self);
10490 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010491
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010492 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010493 if (len == 1) {
10494 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10495 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10496 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010497
10498 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010499 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010500 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010501
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010502 for (i = 0; i < len; i++) {
10503 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010504 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010505 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010506 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010507 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010508}
10509
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010510PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010511 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010512\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010513Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010514False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010515
10516static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010517unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010518{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010519 Py_ssize_t i, length;
10520 int kind;
10521 void *data;
10522
10523 if (PyUnicode_READY(self) == -1)
10524 return NULL;
10525 length = PyUnicode_GET_LENGTH(self);
10526 kind = PyUnicode_KIND(self);
10527 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010528
Guido van Rossumd57fd912000-03-10 22:53:23 +000010529 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010530 if (length == 1)
10531 return PyBool_FromLong(
10532 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010533
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010534 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010535 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010536 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010537
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010538 for (i = 0; i < length; i++) {
10539 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010540 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010541 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010542 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010543}
10544
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010545PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010546 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010547\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010548Return True if all characters in S are digits\n\
10549and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010550
10551static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010552unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010553{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010554 Py_ssize_t i, length;
10555 int kind;
10556 void *data;
10557
10558 if (PyUnicode_READY(self) == -1)
10559 return NULL;
10560 length = PyUnicode_GET_LENGTH(self);
10561 kind = PyUnicode_KIND(self);
10562 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010563
Guido van Rossumd57fd912000-03-10 22:53:23 +000010564 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010565 if (length == 1) {
10566 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10567 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
10568 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010569
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010570 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010571 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010572 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010573
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010574 for (i = 0; i < length; i++) {
10575 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010576 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010577 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010578 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010579}
10580
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010581PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010582 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010583\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010584Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010585False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010586
10587static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010588unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010589{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010590 Py_ssize_t i, length;
10591 int kind;
10592 void *data;
10593
10594 if (PyUnicode_READY(self) == -1)
10595 return NULL;
10596 length = PyUnicode_GET_LENGTH(self);
10597 kind = PyUnicode_KIND(self);
10598 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010599
Guido van Rossumd57fd912000-03-10 22:53:23 +000010600 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010601 if (length == 1)
10602 return PyBool_FromLong(
10603 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010604
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010605 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010606 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010607 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010608
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010609 for (i = 0; i < length; i++) {
10610 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010611 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010612 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010613 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010614}
10615
Martin v. Löwis47383402007-08-15 07:32:56 +000010616int
10617PyUnicode_IsIdentifier(PyObject *self)
10618{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010619 int kind;
10620 void *data;
10621 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010622 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000010623
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010624 if (PyUnicode_READY(self) == -1) {
10625 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000010626 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010627 }
10628
10629 /* Special case for empty strings */
10630 if (PyUnicode_GET_LENGTH(self) == 0)
10631 return 0;
10632 kind = PyUnicode_KIND(self);
10633 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000010634
10635 /* PEP 3131 says that the first character must be in
10636 XID_Start and subsequent characters in XID_Continue,
10637 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000010638 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000010639 letters, digits, underscore). However, given the current
10640 definition of XID_Start and XID_Continue, it is sufficient
10641 to check just for these, except that _ must be allowed
10642 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010643 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050010644 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000010645 return 0;
10646
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040010647 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010648 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010649 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000010650 return 1;
10651}
10652
10653PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010654 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000010655\n\
10656Return True if S is a valid identifier according\n\
10657to the language definition.");
10658
10659static PyObject*
10660unicode_isidentifier(PyObject *self)
10661{
10662 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
10663}
10664
Georg Brandl559e5d72008-06-11 18:37:52 +000010665PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010666 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000010667\n\
10668Return True if all characters in S are considered\n\
10669printable in repr() or S is empty, False otherwise.");
10670
10671static PyObject*
10672unicode_isprintable(PyObject *self)
10673{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010674 Py_ssize_t i, length;
10675 int kind;
10676 void *data;
10677
10678 if (PyUnicode_READY(self) == -1)
10679 return NULL;
10680 length = PyUnicode_GET_LENGTH(self);
10681 kind = PyUnicode_KIND(self);
10682 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000010683
10684 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010685 if (length == 1)
10686 return PyBool_FromLong(
10687 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000010688
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010689 for (i = 0; i < length; i++) {
10690 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010691 Py_RETURN_FALSE;
10692 }
10693 }
10694 Py_RETURN_TRUE;
10695}
10696
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010697PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000010698 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010699\n\
10700Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000010701iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010702
10703static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010704unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010705{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010706 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010707}
10708
Martin v. Löwis18e16552006-02-15 17:27:45 +000010709static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000010710unicode_length(PyUnicodeObject *self)
10711{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010712 if (PyUnicode_READY(self) == -1)
10713 return -1;
10714 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010715}
10716
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010717PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010718 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010719\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010720Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010721done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010722
10723static PyObject *
10724unicode_ljust(PyUnicodeObject *self, PyObject *args)
10725{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010726 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010727 Py_UCS4 fillchar = ' ';
10728
10729 if (PyUnicode_READY(self) == -1)
10730 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010731
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010732 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010733 return NULL;
10734
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010735 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010736 Py_INCREF(self);
10737 return (PyObject*) self;
10738 }
10739
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010740 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010741}
10742
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010743PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010744 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010745\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010746Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010747
10748static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010749unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010750{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010751 return fixup(self, fixlower);
10752}
10753
/* Strip modes; the values double as indexes into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by strip mode */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for a strip mode: the format string without its
   leading "|O:" (3 characters). */
#define STRIPNAME(i) (stripformat[(i)] + 3)
10762
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010763/* externally visible for str.strip(unicode) */
10764PyObject *
10765_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
10766{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010767 void *data;
10768 int kind;
10769 Py_ssize_t i, j, len;
10770 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010771
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010772 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
10773 return NULL;
10774
10775 kind = PyUnicode_KIND(self);
10776 data = PyUnicode_DATA(self);
10777 len = PyUnicode_GET_LENGTH(self);
10778 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
10779 PyUnicode_DATA(sepobj),
10780 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010781
Benjamin Peterson14339b62009-01-31 16:36:08 +000010782 i = 0;
10783 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010784 while (i < len &&
10785 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010786 i++;
10787 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010788 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010789
Benjamin Peterson14339b62009-01-31 16:36:08 +000010790 j = len;
10791 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010792 do {
10793 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010794 } while (j >= i &&
10795 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000010796 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010797 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010798
Victor Stinner12bab6d2011-10-01 01:53:49 +020010799 return PyUnicode_Substring((PyObject*)self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010800}
10801
10802PyObject*
10803PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
10804{
10805 unsigned char *data;
10806 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020010807 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010808
Victor Stinnerde636f32011-10-01 03:55:54 +020010809 if (PyUnicode_READY(self) == -1)
10810 return NULL;
10811
10812 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
10813
Victor Stinner12bab6d2011-10-01 01:53:49 +020010814 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010815 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020010816 if (PyUnicode_CheckExact(self)) {
10817 Py_INCREF(self);
10818 return self;
10819 }
10820 else
10821 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010822 }
10823
Victor Stinner12bab6d2011-10-01 01:53:49 +020010824 length = end - start;
10825 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010826 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010827
Victor Stinnerde636f32011-10-01 03:55:54 +020010828 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020010829 PyErr_SetString(PyExc_IndexError, "string index out of range");
10830 return NULL;
10831 }
10832
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010833 kind = PyUnicode_KIND(self);
10834 data = PyUnicode_1BYTE_DATA(self);
Victor Stinner034f6cf2011-09-30 02:26:44 +020010835 return PyUnicode_FromKindAndData(kind,
10836 data + PyUnicode_KIND_SIZE(kind, start),
Victor Stinner12bab6d2011-10-01 01:53:49 +020010837 length);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010838}
Guido van Rossumd57fd912000-03-10 22:53:23 +000010839
10840static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010841do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010842{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010843 int kind;
10844 void *data;
10845 Py_ssize_t len, i, j;
10846
10847 if (PyUnicode_READY(self) == -1)
10848 return NULL;
10849
10850 kind = PyUnicode_KIND(self);
10851 data = PyUnicode_DATA(self);
10852 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010853
Benjamin Peterson14339b62009-01-31 16:36:08 +000010854 i = 0;
10855 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010856 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010857 i++;
10858 }
10859 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010860
Benjamin Peterson14339b62009-01-31 16:36:08 +000010861 j = len;
10862 if (striptype != LEFTSTRIP) {
10863 do {
10864 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010865 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010866 j++;
10867 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010868
Victor Stinner12bab6d2011-10-01 01:53:49 +020010869 return PyUnicode_Substring((PyObject*)self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010870}
10871
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010872
10873static PyObject *
10874do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
10875{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010876 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010877
Benjamin Peterson14339b62009-01-31 16:36:08 +000010878 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
10879 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010880
Benjamin Peterson14339b62009-01-31 16:36:08 +000010881 if (sep != NULL && sep != Py_None) {
10882 if (PyUnicode_Check(sep))
10883 return _PyUnicode_XStrip(self, striptype, sep);
10884 else {
10885 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010886 "%s arg must be None or str",
10887 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010888 return NULL;
10889 }
10890 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010891
Benjamin Peterson14339b62009-01-31 16:36:08 +000010892 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010893}
10894
10895
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010896PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010897 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010898\n\
10899Return a copy of the string S with leading and trailing\n\
10900whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010901If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010902
10903static PyObject *
10904unicode_strip(PyUnicodeObject *self, PyObject *args)
10905{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010906 if (PyTuple_GET_SIZE(args) == 0)
10907 return do_strip(self, BOTHSTRIP); /* Common case */
10908 else
10909 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010910}
10911
10912
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010913PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010914 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010915\n\
10916Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010917If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010918
10919static PyObject *
10920unicode_lstrip(PyUnicodeObject *self, PyObject *args)
10921{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010922 if (PyTuple_GET_SIZE(args) == 0)
10923 return do_strip(self, LEFTSTRIP); /* Common case */
10924 else
10925 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010926}
10927
10928
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010929PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010930 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010931\n\
10932Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010933If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010934
10935static PyObject *
10936unicode_rstrip(PyUnicodeObject *self, PyObject *args)
10937{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010938 if (PyTuple_GET_SIZE(args) == 0)
10939 return do_strip(self, RIGHTSTRIP); /* Common case */
10940 else
10941 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010942}
10943
10944
Guido van Rossumd57fd912000-03-10 22:53:23 +000010945static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000010946unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010947{
10948 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010949 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010950
Georg Brandl222de0f2009-04-12 12:01:50 +000010951 if (len < 1) {
10952 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020010953 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000010954 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010955
Tim Peters7a29bd52001-09-12 03:03:31 +000010956 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010957 /* no repeat, return original string */
10958 Py_INCREF(str);
10959 return (PyObject*) str;
10960 }
Tim Peters8f422462000-09-09 06:13:41 +000010961
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010962 if (PyUnicode_READY(str) == -1)
10963 return NULL;
10964
Victor Stinnerc759f3e2011-10-01 03:09:58 +020010965 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020010966 PyErr_SetString(PyExc_OverflowError,
10967 "repeated string is too long");
10968 return NULL;
10969 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010970 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020010971
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010972 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010973 if (!u)
10974 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020010975 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010976
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010977 if (PyUnicode_GET_LENGTH(str) == 1) {
10978 const int kind = PyUnicode_KIND(str);
10979 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
10980 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020010981 if (kind == PyUnicode_1BYTE_KIND)
10982 memset(to, (unsigned char)fill_char, len);
10983 else {
10984 for (n = 0; n < len; ++n)
10985 PyUnicode_WRITE(kind, to, n, fill_char);
10986 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010987 }
10988 else {
10989 /* number of characters copied this far */
10990 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
10991 const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str);
10992 char *to = (char *) PyUnicode_DATA(u);
10993 Py_MEMCPY(to, PyUnicode_DATA(str),
10994 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000010995 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010996 n = (done <= nchars-done) ? done : nchars-done;
10997 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010998 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000010999 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011000 }
11001
11002 return (PyObject*) u;
11003}
11004
Alexander Belopolsky40018472011-02-26 01:02:56 +000011005PyObject *
11006PyUnicode_Replace(PyObject *obj,
11007 PyObject *subobj,
11008 PyObject *replobj,
11009 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011010{
11011 PyObject *self;
11012 PyObject *str1;
11013 PyObject *str2;
11014 PyObject *result;
11015
11016 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011017 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011018 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011019 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011020 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011021 Py_DECREF(self);
11022 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011023 }
11024 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011025 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011026 Py_DECREF(self);
11027 Py_DECREF(str1);
11028 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011029 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011030 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011031 Py_DECREF(self);
11032 Py_DECREF(str1);
11033 Py_DECREF(str2);
11034 return result;
11035}
11036
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011037PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011038 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011039\n\
11040Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011041old replaced by new. If the optional argument count is\n\
11042given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011043
11044static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011045unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011046{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011047 PyObject *str1;
11048 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011049 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011050 PyObject *result;
11051
Martin v. Löwis18e16552006-02-15 17:27:45 +000011052 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011053 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011054 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011055 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011056 str1 = PyUnicode_FromObject(str1);
11057 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11058 return NULL;
11059 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011060 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011061 Py_DECREF(str1);
11062 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011063 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011064
11065 result = replace(self, str1, str2, maxcount);
11066
11067 Py_DECREF(str1);
11068 Py_DECREF(str2);
11069 return result;
11070}
11071
Alexander Belopolsky40018472011-02-26 01:02:56 +000011072static PyObject *
11073unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011074{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011075 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011076 Py_ssize_t isize;
11077 Py_ssize_t osize, squote, dquote, i, o;
11078 Py_UCS4 max, quote;
11079 int ikind, okind;
11080 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011081
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011082 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011083 return NULL;
11084
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011085 isize = PyUnicode_GET_LENGTH(unicode);
11086 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011087
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011088 /* Compute length of output, quote characters, and
11089 maximum character */
11090 osize = 2; /* quotes */
11091 max = 127;
11092 squote = dquote = 0;
11093 ikind = PyUnicode_KIND(unicode);
11094 for (i = 0; i < isize; i++) {
11095 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11096 switch (ch) {
11097 case '\'': squote++; osize++; break;
11098 case '"': dquote++; osize++; break;
11099 case '\\': case '\t': case '\r': case '\n':
11100 osize += 2; break;
11101 default:
11102 /* Fast-path ASCII */
11103 if (ch < ' ' || ch == 0x7f)
11104 osize += 4; /* \xHH */
11105 else if (ch < 0x7f)
11106 osize++;
11107 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11108 osize++;
11109 max = ch > max ? ch : max;
11110 }
11111 else if (ch < 0x100)
11112 osize += 4; /* \xHH */
11113 else if (ch < 0x10000)
11114 osize += 6; /* \uHHHH */
11115 else
11116 osize += 10; /* \uHHHHHHHH */
11117 }
11118 }
11119
11120 quote = '\'';
11121 if (squote) {
11122 if (dquote)
11123 /* Both squote and dquote present. Use squote,
11124 and escape them */
11125 osize += squote;
11126 else
11127 quote = '"';
11128 }
11129
11130 repr = PyUnicode_New(osize, max);
11131 if (repr == NULL)
11132 return NULL;
11133 okind = PyUnicode_KIND(repr);
11134 odata = PyUnicode_DATA(repr);
11135
11136 PyUnicode_WRITE(okind, odata, 0, quote);
11137 PyUnicode_WRITE(okind, odata, osize-1, quote);
11138
11139 for (i = 0, o = 1; i < isize; i++) {
11140 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011141
11142 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011143 if ((ch == quote) || (ch == '\\')) {
11144 PyUnicode_WRITE(okind, odata, o++, '\\');
11145 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011146 continue;
11147 }
11148
Benjamin Peterson29060642009-01-31 22:14:21 +000011149 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011150 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011151 PyUnicode_WRITE(okind, odata, o++, '\\');
11152 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011153 }
11154 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011155 PyUnicode_WRITE(okind, odata, o++, '\\');
11156 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011157 }
11158 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011159 PyUnicode_WRITE(okind, odata, o++, '\\');
11160 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011161 }
11162
11163 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011164 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011165 PyUnicode_WRITE(okind, odata, o++, '\\');
11166 PyUnicode_WRITE(okind, odata, o++, 'x');
11167 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11168 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011169 }
11170
Georg Brandl559e5d72008-06-11 18:37:52 +000011171 /* Copy ASCII characters as-is */
11172 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011173 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011174 }
11175
Benjamin Peterson29060642009-01-31 22:14:21 +000011176 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000011177 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011178 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000011179 (categories Z* and C* except ASCII space)
11180 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011181 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011182 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011183 if (ch <= 0xff) {
11184 PyUnicode_WRITE(okind, odata, o++, '\\');
11185 PyUnicode_WRITE(okind, odata, o++, 'x');
11186 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11187 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011188 }
11189 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011190 else if (ch >= 0x10000) {
11191 PyUnicode_WRITE(okind, odata, o++, '\\');
11192 PyUnicode_WRITE(okind, odata, o++, 'U');
11193 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
11194 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
11195 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
11196 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
11197 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11198 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11199 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11200 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011201 }
11202 /* Map 16-bit characters to '\uxxxx' */
11203 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011204 PyUnicode_WRITE(okind, odata, o++, '\\');
11205 PyUnicode_WRITE(okind, odata, o++, 'u');
11206 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11207 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11208 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11209 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011210 }
11211 }
11212 /* Copy characters as-is */
11213 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011214 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011215 }
11216 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000011217 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011218 /* Closing quote already added at the beginning */
Walter Dörwald79e913e2007-05-12 11:08:06 +000011219 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011220}
11221
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011222PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011223 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011224\n\
11225Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011226such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011227arguments start and end are interpreted as in slice notation.\n\
11228\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011229Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011230
11231static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011232unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011233{
Jesus Ceaac451502011-04-20 17:09:23 +020011234 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011235 Py_ssize_t start;
11236 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011237 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011238
Jesus Ceaac451502011-04-20 17:09:23 +020011239 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
11240 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011241 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011242
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011243 if (PyUnicode_READY(self) == -1)
11244 return NULL;
11245 if (PyUnicode_READY(substring) == -1)
11246 return NULL;
11247
11248 result = any_find_slice(
11249 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11250 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011251 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011252
11253 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011254
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011255 if (result == -2)
11256 return NULL;
11257
Christian Heimes217cfd12007-12-02 14:31:20 +000011258 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011259}
11260
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011261PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011262 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011263\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011264Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011265
11266static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011267unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011268{
Jesus Ceaac451502011-04-20 17:09:23 +020011269 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011270 Py_ssize_t start;
11271 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011272 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011273
Jesus Ceaac451502011-04-20 17:09:23 +020011274 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
11275 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011276 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011277
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011278 if (PyUnicode_READY(self) == -1)
11279 return NULL;
11280 if (PyUnicode_READY(substring) == -1)
11281 return NULL;
11282
11283 result = any_find_slice(
11284 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11285 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011286 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011287
11288 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011289
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011290 if (result == -2)
11291 return NULL;
11292
Guido van Rossumd57fd912000-03-10 22:53:23 +000011293 if (result < 0) {
11294 PyErr_SetString(PyExc_ValueError, "substring not found");
11295 return NULL;
11296 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011297
Christian Heimes217cfd12007-12-02 14:31:20 +000011298 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011299}
11300
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011301PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011302 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011303\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011304Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011305done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011306
11307static PyObject *
11308unicode_rjust(PyUnicodeObject *self, PyObject *args)
11309{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011310 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011311 Py_UCS4 fillchar = ' ';
11312
Victor Stinnere9a29352011-10-01 02:14:59 +020011313 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011314 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011315
Victor Stinnere9a29352011-10-01 02:14:59 +020011316 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011317 return NULL;
11318
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011319 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011320 Py_INCREF(self);
11321 return (PyObject*) self;
11322 }
11323
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011324 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011325}
11326
Alexander Belopolsky40018472011-02-26 01:02:56 +000011327PyObject *
11328PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011329{
11330 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000011331
Guido van Rossumd57fd912000-03-10 22:53:23 +000011332 s = PyUnicode_FromObject(s);
11333 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011334 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011335 if (sep != NULL) {
11336 sep = PyUnicode_FromObject(sep);
11337 if (sep == NULL) {
11338 Py_DECREF(s);
11339 return NULL;
11340 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011341 }
11342
11343 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11344
11345 Py_DECREF(s);
11346 Py_XDECREF(sep);
11347 return result;
11348}
11349
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011350PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011351 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011352\n\
11353Return a list of the words in S, using sep as the\n\
11354delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000011355splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000011356whitespace string is a separator and empty strings are\n\
11357removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011358
11359static PyObject*
11360unicode_split(PyUnicodeObject *self, PyObject *args)
11361{
11362 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011363 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011364
Martin v. Löwis18e16552006-02-15 17:27:45 +000011365 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011366 return NULL;
11367
11368 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011369 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011370 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011371 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011372 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011373 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011374}
11375
Thomas Wouters477c8d52006-05-27 19:21:47 +000011376PyObject *
11377PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11378{
11379 PyObject* str_obj;
11380 PyObject* sep_obj;
11381 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011382 int kind1, kind2, kind;
11383 void *buf1 = NULL, *buf2 = NULL;
11384 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011385
11386 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020011387 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011388 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011389 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011390 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011391 Py_DECREF(str_obj);
11392 return NULL;
11393 }
11394
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011395 kind1 = PyUnicode_KIND(str_in);
11396 kind2 = PyUnicode_KIND(sep_obj);
11397 kind = kind1 > kind2 ? kind1 : kind2;
11398 buf1 = PyUnicode_DATA(str_in);
11399 if (kind1 != kind)
11400 buf1 = _PyUnicode_AsKind(str_in, kind);
11401 if (!buf1)
11402 goto onError;
11403 buf2 = PyUnicode_DATA(sep_obj);
11404 if (kind2 != kind)
11405 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11406 if (!buf2)
11407 goto onError;
11408 len1 = PyUnicode_GET_LENGTH(str_obj);
11409 len2 = PyUnicode_GET_LENGTH(sep_obj);
11410
11411 switch(PyUnicode_KIND(str_in)) {
11412 case PyUnicode_1BYTE_KIND:
11413 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11414 break;
11415 case PyUnicode_2BYTE_KIND:
11416 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11417 break;
11418 case PyUnicode_4BYTE_KIND:
11419 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11420 break;
11421 default:
11422 assert(0);
11423 out = 0;
11424 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011425
11426 Py_DECREF(sep_obj);
11427 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011428 if (kind1 != kind)
11429 PyMem_Free(buf1);
11430 if (kind2 != kind)
11431 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011432
11433 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011434 onError:
11435 Py_DECREF(sep_obj);
11436 Py_DECREF(str_obj);
11437 if (kind1 != kind && buf1)
11438 PyMem_Free(buf1);
11439 if (kind2 != kind && buf2)
11440 PyMem_Free(buf2);
11441 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011442}
11443
11444
11445PyObject *
11446PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11447{
11448 PyObject* str_obj;
11449 PyObject* sep_obj;
11450 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011451 int kind1, kind2, kind;
11452 void *buf1 = NULL, *buf2 = NULL;
11453 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011454
11455 str_obj = PyUnicode_FromObject(str_in);
11456 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000011457 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011458 sep_obj = PyUnicode_FromObject(sep_in);
11459 if (!sep_obj) {
11460 Py_DECREF(str_obj);
11461 return NULL;
11462 }
11463
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011464 kind1 = PyUnicode_KIND(str_in);
11465 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020011466 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011467 buf1 = PyUnicode_DATA(str_in);
11468 if (kind1 != kind)
11469 buf1 = _PyUnicode_AsKind(str_in, kind);
11470 if (!buf1)
11471 goto onError;
11472 buf2 = PyUnicode_DATA(sep_obj);
11473 if (kind2 != kind)
11474 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11475 if (!buf2)
11476 goto onError;
11477 len1 = PyUnicode_GET_LENGTH(str_obj);
11478 len2 = PyUnicode_GET_LENGTH(sep_obj);
11479
11480 switch(PyUnicode_KIND(str_in)) {
11481 case PyUnicode_1BYTE_KIND:
11482 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11483 break;
11484 case PyUnicode_2BYTE_KIND:
11485 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11486 break;
11487 case PyUnicode_4BYTE_KIND:
11488 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11489 break;
11490 default:
11491 assert(0);
11492 out = 0;
11493 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011494
11495 Py_DECREF(sep_obj);
11496 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011497 if (kind1 != kind)
11498 PyMem_Free(buf1);
11499 if (kind2 != kind)
11500 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011501
11502 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011503 onError:
11504 Py_DECREF(sep_obj);
11505 Py_DECREF(str_obj);
11506 if (kind1 != kind && buf1)
11507 PyMem_Free(buf1);
11508 if (kind2 != kind && buf2)
11509 PyMem_Free(buf2);
11510 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011511}
11512
11513PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011514 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011515\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011516Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011517the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011518found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011519
11520static PyObject*
11521unicode_partition(PyUnicodeObject *self, PyObject *separator)
11522{
11523 return PyUnicode_Partition((PyObject *)self, separator);
11524}
11525
11526PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000011527 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011528\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011529Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011530the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011531separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011532
11533static PyObject*
11534unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
11535{
11536 return PyUnicode_RPartition((PyObject *)self, separator);
11537}
11538
Alexander Belopolsky40018472011-02-26 01:02:56 +000011539PyObject *
11540PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011541{
11542 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011543
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011544 s = PyUnicode_FromObject(s);
11545 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011546 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011547 if (sep != NULL) {
11548 sep = PyUnicode_FromObject(sep);
11549 if (sep == NULL) {
11550 Py_DECREF(s);
11551 return NULL;
11552 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011553 }
11554
11555 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11556
11557 Py_DECREF(s);
11558 Py_XDECREF(sep);
11559 return result;
11560}
11561
11562PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011563 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011564\n\
11565Return a list of the words in S, using sep as the\n\
11566delimiter string, starting at the end of the string and\n\
11567working to the front. If maxsplit is given, at most maxsplit\n\
11568splits are done. If sep is not specified, any whitespace string\n\
11569is a separator.");
11570
11571static PyObject*
11572unicode_rsplit(PyUnicodeObject *self, PyObject *args)
11573{
11574 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011575 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011576
Martin v. Löwis18e16552006-02-15 17:27:45 +000011577 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011578 return NULL;
11579
11580 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011581 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011582 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011583 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011584 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011585 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011586}
11587
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011588PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011589 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011590\n\
11591Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000011592Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011593is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011594
11595static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011596unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011597{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011598 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000011599 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011600
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011601 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
11602 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011603 return NULL;
11604
Guido van Rossum86662912000-04-11 15:38:46 +000011605 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011606}
11607
11608static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000011609PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011610{
Walter Dörwald346737f2007-05-31 10:44:43 +000011611 if (PyUnicode_CheckExact(self)) {
11612 Py_INCREF(self);
11613 return self;
11614 } else
11615 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020011616 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011617}
11618
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011619PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011620 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011621\n\
11622Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011623and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011624
11625static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011626unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011627{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011628 return fixup(self, fixswapcase);
11629}
11630
Georg Brandlceee0772007-11-27 23:48:05 +000011631PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011632 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011633\n\
11634Return a translation table usable for str.translate().\n\
11635If there is only one argument, it must be a dictionary mapping Unicode\n\
11636ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011637Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011638If there are two arguments, they must be strings of equal length, and\n\
11639in the resulting dictionary, each character in x will be mapped to the\n\
11640character at the same position in y. If there is a third argument, it\n\
11641must be a string, whose characters will be mapped to None in the result.");
11642
11643static PyObject*
11644unicode_maketrans(PyUnicodeObject *null, PyObject *args)
11645{
11646 PyObject *x, *y = NULL, *z = NULL;
11647 PyObject *new = NULL, *key, *value;
11648 Py_ssize_t i = 0;
11649 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011650
Georg Brandlceee0772007-11-27 23:48:05 +000011651 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
11652 return NULL;
11653 new = PyDict_New();
11654 if (!new)
11655 return NULL;
11656 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011657 int x_kind, y_kind, z_kind;
11658 void *x_data, *y_data, *z_data;
11659
Georg Brandlceee0772007-11-27 23:48:05 +000011660 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000011661 if (!PyUnicode_Check(x)) {
11662 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
11663 "be a string if there is a second argument");
11664 goto err;
11665 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011666 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011667 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
11668 "arguments must have equal length");
11669 goto err;
11670 }
11671 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011672 x_kind = PyUnicode_KIND(x);
11673 y_kind = PyUnicode_KIND(y);
11674 x_data = PyUnicode_DATA(x);
11675 y_data = PyUnicode_DATA(y);
11676 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
11677 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
11678 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011679 if (!key || !value)
11680 goto err;
11681 res = PyDict_SetItem(new, key, value);
11682 Py_DECREF(key);
11683 Py_DECREF(value);
11684 if (res < 0)
11685 goto err;
11686 }
11687 /* create entries for deleting chars in z */
11688 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011689 z_kind = PyUnicode_KIND(z);
11690 z_data = PyUnicode_DATA(z);
Georg Brandlceee0772007-11-27 23:48:05 +000011691 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011692 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011693 if (!key)
11694 goto err;
11695 res = PyDict_SetItem(new, key, Py_None);
11696 Py_DECREF(key);
11697 if (res < 0)
11698 goto err;
11699 }
11700 }
11701 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011702 int kind;
11703 void *data;
11704
Georg Brandlceee0772007-11-27 23:48:05 +000011705 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000011706 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011707 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
11708 "to maketrans it must be a dict");
11709 goto err;
11710 }
11711 /* copy entries into the new dict, converting string keys to int keys */
11712 while (PyDict_Next(x, &i, &key, &value)) {
11713 if (PyUnicode_Check(key)) {
11714 /* convert string keys to integer keys */
11715 PyObject *newkey;
11716 if (PyUnicode_GET_SIZE(key) != 1) {
11717 PyErr_SetString(PyExc_ValueError, "string keys in translate "
11718 "table must be of length 1");
11719 goto err;
11720 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011721 kind = PyUnicode_KIND(key);
11722 data = PyUnicode_DATA(key);
11723 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000011724 if (!newkey)
11725 goto err;
11726 res = PyDict_SetItem(new, newkey, value);
11727 Py_DECREF(newkey);
11728 if (res < 0)
11729 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000011730 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011731 /* just keep integer keys */
11732 if (PyDict_SetItem(new, key, value) < 0)
11733 goto err;
11734 } else {
11735 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
11736 "be strings or integers");
11737 goto err;
11738 }
11739 }
11740 }
11741 return new;
11742 err:
11743 Py_DECREF(new);
11744 return NULL;
11745}
11746
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011747PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011748 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011749\n\
11750Return a copy of the string S, where all characters have been mapped\n\
11751through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011752Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000011753Unmapped characters are left untouched. Characters mapped to None\n\
11754are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011755
11756static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011757unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011758{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011759 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011760}
11761
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011762PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011763 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011764\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011765Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011766
11767static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011768unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011769{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011770 return fixup(self, fixupper);
11771}
11772
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011773PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011774 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011775\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000011776Pad a numeric string S with zeros on the left, to fill a field\n\
11777of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011778
11779static PyObject *
11780unicode_zfill(PyUnicodeObject *self, PyObject *args)
11781{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011782 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011783 PyUnicodeObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011784 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011785 int kind;
11786 void *data;
11787 Py_UCS4 chr;
11788
11789 if (PyUnicode_READY(self) == -1)
11790 return NULL;
11791
Martin v. Löwis18e16552006-02-15 17:27:45 +000011792 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011793 return NULL;
11794
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011795 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000011796 if (PyUnicode_CheckExact(self)) {
11797 Py_INCREF(self);
11798 return (PyObject*) self;
11799 }
11800 else
Victor Stinner2219e0a2011-10-01 01:16:59 +020011801 return PyUnicode_Copy((PyObject*)self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011802 }
11803
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011804 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011805
11806 u = pad(self, fill, 0, '0');
11807
Walter Dörwald068325e2002-04-15 13:36:47 +000011808 if (u == NULL)
11809 return NULL;
11810
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011811 kind = PyUnicode_KIND(u);
11812 data = PyUnicode_DATA(u);
11813 chr = PyUnicode_READ(kind, data, fill);
11814
11815 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011816 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011817 PyUnicode_WRITE(kind, data, 0, chr);
11818 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000011819 }
11820
11821 return (PyObject*) u;
11822}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011823
#if 0
/* Disabled helper: forwards `self` to
   PyUnicode_TransformDecimalAndSpaceToASCII() and returns its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *ascii = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return ascii;
}
#endif
11831
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011832PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011833 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011834\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011835Return True if S starts with the specified prefix, False otherwise.\n\
11836With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011837With optional end, stop comparing S at that position.\n\
11838prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011839
11840static PyObject *
11841unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011842 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011843{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011844 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011845 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011846 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011847 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011848 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011849
Jesus Ceaac451502011-04-20 17:09:23 +020011850 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011851 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011852 if (PyTuple_Check(subobj)) {
11853 Py_ssize_t i;
11854 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11855 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011856 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011857 if (substring == NULL)
11858 return NULL;
11859 result = tailmatch(self, substring, start, end, -1);
11860 Py_DECREF(substring);
11861 if (result) {
11862 Py_RETURN_TRUE;
11863 }
11864 }
11865 /* nothing matched */
11866 Py_RETURN_FALSE;
11867 }
11868 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011869 if (substring == NULL) {
11870 if (PyErr_ExceptionMatches(PyExc_TypeError))
11871 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
11872 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011873 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011874 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011875 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011876 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011877 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011878}
11879
11880
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011881PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011882 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011883\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011884Return True if S ends with the specified suffix, False otherwise.\n\
11885With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011886With optional end, stop comparing S at that position.\n\
11887suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011888
11889static PyObject *
11890unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011891 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011892{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011893 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011894 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011895 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011896 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011897 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011898
Jesus Ceaac451502011-04-20 17:09:23 +020011899 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011900 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011901 if (PyTuple_Check(subobj)) {
11902 Py_ssize_t i;
11903 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11904 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011905 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011906 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011907 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011908 result = tailmatch(self, substring, start, end, +1);
11909 Py_DECREF(substring);
11910 if (result) {
11911 Py_RETURN_TRUE;
11912 }
11913 }
11914 Py_RETURN_FALSE;
11915 }
11916 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011917 if (substring == NULL) {
11918 if (PyErr_ExceptionMatches(PyExc_TypeError))
11919 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
11920 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011921 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011922 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011923 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011924 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011925 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011926}
11927
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011928#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000011929
11930PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011931 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000011932\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011933Return a formatted version of S, using substitutions from args and kwargs.\n\
11934The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000011935
Eric Smith27bbca62010-11-04 17:06:58 +000011936PyDoc_STRVAR(format_map__doc__,
11937 "S.format_map(mapping) -> str\n\
11938\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011939Return a formatted version of S, using substitutions from mapping.\n\
11940The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000011941
Eric Smith4a7d76d2008-05-30 18:10:19 +000011942static PyObject *
11943unicode__format__(PyObject* self, PyObject* args)
11944{
11945 PyObject *format_spec;
11946
11947 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
11948 return NULL;
11949
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011950 return _PyUnicode_FormatAdvanced(self, format_spec, 0,
11951 PyUnicode_GET_LENGTH(format_spec));
Eric Smith4a7d76d2008-05-30 18:10:19 +000011952}
11953
Eric Smith8c663262007-08-25 02:26:07 +000011954PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011955 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000011956\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011957Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000011958
11959static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011960unicode__sizeof__(PyUnicodeObject *v)
11961{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011962 Py_ssize_t size;
11963
11964 /* If it's a compact object, account for base structure +
11965 character data. */
11966 if (PyUnicode_IS_COMPACT_ASCII(v))
11967 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
11968 else if (PyUnicode_IS_COMPACT(v))
11969 size = sizeof(PyCompactUnicodeObject) +
11970 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v);
11971 else {
11972 /* If it is a two-block object, account for base object, and
11973 for character block if present. */
11974 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020011975 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011976 size += (PyUnicode_GET_LENGTH(v) + 1) *
11977 PyUnicode_CHARACTER_SIZE(v);
11978 }
11979 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020011980 with the data pointer. Check if the data is not shared. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011981 if (_PyUnicode_WSTR(v) &&
Victor Stinnera3be6132011-10-03 02:16:37 +020011982 (PyUnicode_DATA(v) != _PyUnicode_WSTR(v)))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011983 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020011984 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020011985 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011986
11987 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011988}
11989
11990PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011991 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011992
11993static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020011994unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000011995{
Victor Stinner034f6cf2011-09-30 02:26:44 +020011996 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011997 if (!copy)
11998 return NULL;
11999 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012000}
12001
Guido van Rossumd57fd912000-03-10 22:53:23 +000012002static PyMethodDef unicode_methods[] = {
12003
12004 /* Order is according to common usage: often used methods should
12005 appear first, since lookup is done sequentially. */
12006
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012007 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012008 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12009 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012010 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012011 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12012 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12013 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12014 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12015 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12016 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12017 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012018 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012019 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12020 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12021 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012022 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012023 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12024 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12025 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012026 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012027 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012028 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012029 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012030 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12031 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12032 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12033 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12034 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12035 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12036 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12037 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12038 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12039 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12040 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12041 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12042 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12043 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012044 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012045 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012046 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012047 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012048 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012049 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012050 {"maketrans", (PyCFunction) unicode_maketrans,
12051 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012052 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012053#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012054 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012055#endif
12056
12057#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012058 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012059 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012060#endif
12061
Benjamin Peterson14339b62009-01-31 16:36:08 +000012062 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012063 {NULL, NULL}
12064};
12065
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012066static PyObject *
12067unicode_mod(PyObject *v, PyObject *w)
12068{
Brian Curtindfc80e32011-08-10 20:28:54 -050012069 if (!PyUnicode_Check(v))
12070 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012071 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012072}
12073
12074static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012075 0, /*nb_add*/
12076 0, /*nb_subtract*/
12077 0, /*nb_multiply*/
12078 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012079};
12080
Guido van Rossumd57fd912000-03-10 22:53:23 +000012081static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012082 (lenfunc) unicode_length, /* sq_length */
12083 PyUnicode_Concat, /* sq_concat */
12084 (ssizeargfunc) unicode_repeat, /* sq_repeat */
12085 (ssizeargfunc) unicode_getitem, /* sq_item */
12086 0, /* sq_slice */
12087 0, /* sq_ass_item */
12088 0, /* sq_ass_slice */
12089 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012090};
12091
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012092static PyObject*
12093unicode_subscript(PyUnicodeObject* self, PyObject* item)
12094{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012095 if (PyUnicode_READY(self) == -1)
12096 return NULL;
12097
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000012098 if (PyIndex_Check(item)) {
12099 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012100 if (i == -1 && PyErr_Occurred())
12101 return NULL;
12102 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012103 i += PyUnicode_GET_LENGTH(self);
Victor Stinner2fe5ced2011-10-02 00:25:40 +020012104 return unicode_getitem((PyObject*)self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012105 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000012106 Py_ssize_t start, stop, step, slicelength, cur, i;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012107 const Py_UNICODE* source_buf;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012108 Py_UNICODE* result_buf;
12109 PyObject* result;
12110
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012111 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000012112 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012113 return NULL;
12114 }
12115
12116 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012117 return PyUnicode_New(0, 0);
12118 } else if (start == 0 && step == 1 &&
12119 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000012120 PyUnicode_CheckExact(self)) {
12121 Py_INCREF(self);
12122 return (PyObject *)self;
12123 } else if (step == 1) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012124 return PyUnicode_Substring((PyObject*)self,
12125 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012126 } else {
12127 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +000012128 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
12129 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012130
Benjamin Peterson29060642009-01-31 22:14:21 +000012131 if (result_buf == NULL)
12132 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012133
12134 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
12135 result_buf[i] = source_buf[cur];
12136 }
Tim Petersced69f82003-09-16 20:30:58 +000012137
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012138 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +000012139 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012140 return result;
12141 }
12142 } else {
12143 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
12144 return NULL;
12145 }
12146}
12147
12148static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012149 (lenfunc)unicode_length, /* mp_length */
12150 (binaryfunc)unicode_subscript, /* mp_subscript */
12151 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012152};
12153
Guido van Rossumd57fd912000-03-10 22:53:23 +000012154
Guido van Rossumd57fd912000-03-10 22:53:23 +000012155/* Helpers for PyUnicode_Format() */
12156
12157static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000012158getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012159{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012160 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012161 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012162 (*p_argidx)++;
12163 if (arglen < 0)
12164 return args;
12165 else
12166 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012167 }
12168 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012169 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012170 return NULL;
12171}
12172
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012173/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012174
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012175static PyObject *
12176formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012177{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012178 char *p;
12179 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012180 double x;
Tim Petersced69f82003-09-16 20:30:58 +000012181
Guido van Rossumd57fd912000-03-10 22:53:23 +000012182 x = PyFloat_AsDouble(v);
12183 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012184 return NULL;
12185
Guido van Rossumd57fd912000-03-10 22:53:23 +000012186 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012187 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000012188
Eric Smith0923d1d2009-04-16 20:16:10 +000012189 p = PyOS_double_to_string(x, type, prec,
12190 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012191 if (p == NULL)
12192 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012193 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000012194 PyMem_Free(p);
12195 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012196}
12197
Tim Peters38fd5b62000-09-21 05:43:11 +000012198static PyObject*
12199formatlong(PyObject *val, int flags, int prec, int type)
12200{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012201 char *buf;
12202 int len;
12203 PyObject *str; /* temporary string object. */
12204 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012205
Benjamin Peterson14339b62009-01-31 16:36:08 +000012206 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
12207 if (!str)
12208 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012209 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012210 Py_DECREF(str);
12211 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012212}
12213
Guido van Rossumd57fd912000-03-10 22:53:23 +000012214static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012215formatchar(Py_UCS4 *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012216 size_t buflen,
12217 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012218{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012219 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012220 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012221 if (PyUnicode_GET_LENGTH(v) == 1) {
12222 buf[0] = PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000012223 buf[1] = '\0';
12224 return 1;
12225 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012226 goto onError;
12227 }
12228 else {
12229 /* Integer input truncated to a character */
12230 long x;
12231 x = PyLong_AsLong(v);
12232 if (x == -1 && PyErr_Occurred())
12233 goto onError;
12234
12235 if (x < 0 || x > 0x10ffff) {
12236 PyErr_SetString(PyExc_OverflowError,
12237 "%c arg not in range(0x110000)");
12238 return -1;
12239 }
12240
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012241 buf[0] = (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012242 buf[1] = '\0';
12243 return 1;
12244 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012245
Benjamin Peterson29060642009-01-31 22:14:21 +000012246 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012247 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012248 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012249 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012250}
12251
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
   FORMATBUFLEN is the length (in elements) of the scratch buffer in which
   single characters are formatted.  The expansion is fully parenthesized
   so that the macro behaves as one expression in any context.
*/
#define FORMATBUFLEN ((size_t)10)
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012256
Alexander Belopolsky40018472011-02-26 01:02:56 +000012257PyObject *
12258PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012259{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012260 void *fmt;
12261 int fmtkind;
12262 PyObject *result;
12263 Py_UCS4 *res, *res0;
12264 Py_UCS4 max;
12265 int kind;
12266 Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012267 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012268 PyObject *dict = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012269 PyUnicodeObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +000012270
Guido van Rossumd57fd912000-03-10 22:53:23 +000012271 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012272 PyErr_BadInternalCall();
12273 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012274 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012275 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
12276 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012277 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012278 fmt = PyUnicode_DATA(uformat);
12279 fmtkind = PyUnicode_KIND(uformat);
12280 fmtcnt = PyUnicode_GET_LENGTH(uformat);
12281 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012282
12283 reslen = rescnt = fmtcnt + 100;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012284 res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4));
12285 if (res0 == NULL) {
12286 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012287 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012288 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012289
12290 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012291 arglen = PyTuple_Size(args);
12292 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012293 }
12294 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012295 arglen = -1;
12296 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012297 }
Christian Heimes90aa7642007-12-19 02:45:37 +000012298 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000012299 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000012300 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012301
12302 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012303 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012304 if (--rescnt < 0) {
12305 rescnt = fmtcnt + 100;
12306 reslen += rescnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012307 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12308 if (res0 == NULL){
12309 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012310 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012311 }
12312 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012313 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012314 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012315 *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012316 }
12317 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012318 /* Got a format specifier */
12319 int flags = 0;
12320 Py_ssize_t width = -1;
12321 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012322 Py_UCS4 c = '\0';
12323 Py_UCS4 fill;
Benjamin Peterson29060642009-01-31 22:14:21 +000012324 int isnumok;
12325 PyObject *v = NULL;
12326 PyObject *temp = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012327 void *pbuf;
12328 Py_ssize_t pindex;
Benjamin Peterson29060642009-01-31 22:14:21 +000012329 Py_UNICODE sign;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012330 Py_ssize_t len, len1;
12331 Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012332
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012333 fmtpos++;
12334 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
12335 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000012336 Py_ssize_t keylen;
12337 PyObject *key;
12338 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000012339
Benjamin Peterson29060642009-01-31 22:14:21 +000012340 if (dict == NULL) {
12341 PyErr_SetString(PyExc_TypeError,
12342 "format requires a mapping");
12343 goto onError;
12344 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012345 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012346 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012347 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012348 /* Skip over balanced parentheses */
12349 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012350 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000012351 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012352 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000012353 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012354 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000012355 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012356 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012357 if (fmtcnt < 0 || pcount > 0) {
12358 PyErr_SetString(PyExc_ValueError,
12359 "incomplete format key");
12360 goto onError;
12361 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020012362 key = PyUnicode_Substring((PyObject*)uformat,
12363 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000012364 if (key == NULL)
12365 goto onError;
12366 if (args_owned) {
12367 Py_DECREF(args);
12368 args_owned = 0;
12369 }
12370 args = PyObject_GetItem(dict, key);
12371 Py_DECREF(key);
12372 if (args == NULL) {
12373 goto onError;
12374 }
12375 args_owned = 1;
12376 arglen = -1;
12377 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012378 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012379 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012380 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012381 case '-': flags |= F_LJUST; continue;
12382 case '+': flags |= F_SIGN; continue;
12383 case ' ': flags |= F_BLANK; continue;
12384 case '#': flags |= F_ALT; continue;
12385 case '0': flags |= F_ZERO; continue;
12386 }
12387 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012388 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012389 if (c == '*') {
12390 v = getnextarg(args, arglen, &argidx);
12391 if (v == NULL)
12392 goto onError;
12393 if (!PyLong_Check(v)) {
12394 PyErr_SetString(PyExc_TypeError,
12395 "* wants int");
12396 goto onError;
12397 }
12398 width = PyLong_AsLong(v);
12399 if (width == -1 && PyErr_Occurred())
12400 goto onError;
12401 if (width < 0) {
12402 flags |= F_LJUST;
12403 width = -width;
12404 }
12405 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012406 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012407 }
12408 else if (c >= '0' && c <= '9') {
12409 width = c - '0';
12410 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012411 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012412 if (c < '0' || c > '9')
12413 break;
12414 if ((width*10) / 10 != width) {
12415 PyErr_SetString(PyExc_ValueError,
12416 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000012417 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000012418 }
12419 width = width*10 + (c - '0');
12420 }
12421 }
12422 if (c == '.') {
12423 prec = 0;
12424 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012425 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012426 if (c == '*') {
12427 v = getnextarg(args, arglen, &argidx);
12428 if (v == NULL)
12429 goto onError;
12430 if (!PyLong_Check(v)) {
12431 PyErr_SetString(PyExc_TypeError,
12432 "* wants int");
12433 goto onError;
12434 }
12435 prec = PyLong_AsLong(v);
12436 if (prec == -1 && PyErr_Occurred())
12437 goto onError;
12438 if (prec < 0)
12439 prec = 0;
12440 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012441 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012442 }
12443 else if (c >= '0' && c <= '9') {
12444 prec = c - '0';
12445 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012446 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012447 if (c < '0' || c > '9')
12448 break;
12449 if ((prec*10) / 10 != prec) {
12450 PyErr_SetString(PyExc_ValueError,
12451 "prec too big");
12452 goto onError;
12453 }
12454 prec = prec*10 + (c - '0');
12455 }
12456 }
12457 } /* prec */
12458 if (fmtcnt >= 0) {
12459 if (c == 'h' || c == 'l' || c == 'L') {
12460 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012461 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012462 }
12463 }
12464 if (fmtcnt < 0) {
12465 PyErr_SetString(PyExc_ValueError,
12466 "incomplete format");
12467 goto onError;
12468 }
12469 if (c != '%') {
12470 v = getnextarg(args, arglen, &argidx);
12471 if (v == NULL)
12472 goto onError;
12473 }
12474 sign = 0;
12475 fill = ' ';
12476 switch (c) {
12477
12478 case '%':
12479 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012480 kind = PyUnicode_4BYTE_KIND;
Benjamin Peterson29060642009-01-31 22:14:21 +000012481 /* presume that buffer length is at least 1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012482 PyUnicode_WRITE(kind, pbuf, 0, '%');
Benjamin Peterson29060642009-01-31 22:14:21 +000012483 len = 1;
12484 break;
12485
12486 case 's':
12487 case 'r':
12488 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000012489 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012490 temp = v;
12491 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012492 }
12493 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012494 if (c == 's')
12495 temp = PyObject_Str(v);
12496 else if (c == 'r')
12497 temp = PyObject_Repr(v);
12498 else
12499 temp = PyObject_ASCII(v);
12500 if (temp == NULL)
12501 goto onError;
12502 if (PyUnicode_Check(temp))
12503 /* nothing to do */;
12504 else {
12505 Py_DECREF(temp);
12506 PyErr_SetString(PyExc_TypeError,
12507 "%s argument has non-string str()");
12508 goto onError;
12509 }
12510 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012511 if (PyUnicode_READY(temp) == -1) {
12512 Py_CLEAR(temp);
12513 goto onError;
12514 }
12515 pbuf = PyUnicode_DATA(temp);
12516 kind = PyUnicode_KIND(temp);
12517 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012518 if (prec >= 0 && len > prec)
12519 len = prec;
12520 break;
12521
12522 case 'i':
12523 case 'd':
12524 case 'u':
12525 case 'o':
12526 case 'x':
12527 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000012528 isnumok = 0;
12529 if (PyNumber_Check(v)) {
12530 PyObject *iobj=NULL;
12531
12532 if (PyLong_Check(v)) {
12533 iobj = v;
12534 Py_INCREF(iobj);
12535 }
12536 else {
12537 iobj = PyNumber_Long(v);
12538 }
12539 if (iobj!=NULL) {
12540 if (PyLong_Check(iobj)) {
12541 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070012542 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000012543 Py_DECREF(iobj);
12544 if (!temp)
12545 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012546 if (PyUnicode_READY(temp) == -1) {
12547 Py_CLEAR(temp);
12548 goto onError;
12549 }
12550 pbuf = PyUnicode_DATA(temp);
12551 kind = PyUnicode_KIND(temp);
12552 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012553 sign = 1;
12554 }
12555 else {
12556 Py_DECREF(iobj);
12557 }
12558 }
12559 }
12560 if (!isnumok) {
12561 PyErr_Format(PyExc_TypeError,
12562 "%%%c format: a number is required, "
12563 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
12564 goto onError;
12565 }
12566 if (flags & F_ZERO)
12567 fill = '0';
12568 break;
12569
12570 case 'e':
12571 case 'E':
12572 case 'f':
12573 case 'F':
12574 case 'g':
12575 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012576 temp = formatfloat(v, flags, prec, c);
12577 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000012578 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012579 if (PyUnicode_READY(temp) == -1) {
12580 Py_CLEAR(temp);
12581 goto onError;
12582 }
12583 pbuf = PyUnicode_DATA(temp);
12584 kind = PyUnicode_KIND(temp);
12585 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012586 sign = 1;
12587 if (flags & F_ZERO)
12588 fill = '0';
12589 break;
12590
12591 case 'c':
12592 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012593 kind = PyUnicode_4BYTE_KIND;
Victor Stinnerb9dcffb2011-09-29 00:39:24 +020012594 len = formatchar(pbuf, Py_ARRAY_LENGTH(formatbuf), v);
Benjamin Peterson29060642009-01-31 22:14:21 +000012595 if (len < 0)
12596 goto onError;
12597 break;
12598
12599 default:
12600 PyErr_Format(PyExc_ValueError,
12601 "unsupported format character '%c' (0x%x) "
12602 "at index %zd",
12603 (31<=c && c<=126) ? (char)c : '?',
12604 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012605 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000012606 goto onError;
12607 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012608 /* pbuf is initialized here. */
12609 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000012610 if (sign) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012611 if (PyUnicode_READ(kind, pbuf, pindex) == '-' ||
12612 PyUnicode_READ(kind, pbuf, pindex) == '+') {
12613 sign = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012614 len--;
12615 }
12616 else if (flags & F_SIGN)
12617 sign = '+';
12618 else if (flags & F_BLANK)
12619 sign = ' ';
12620 else
12621 sign = 0;
12622 }
12623 if (width < len)
12624 width = len;
12625 if (rescnt - (sign != 0) < width) {
12626 reslen -= rescnt;
12627 rescnt = width + fmtcnt + 100;
12628 reslen += rescnt;
12629 if (reslen < 0) {
12630 Py_XDECREF(temp);
12631 PyErr_NoMemory();
12632 goto onError;
12633 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012634 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12635 if (res0 == 0) {
12636 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012637 Py_XDECREF(temp);
12638 goto onError;
12639 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012640 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012641 }
12642 if (sign) {
12643 if (fill != ' ')
12644 *res++ = sign;
12645 rescnt--;
12646 if (width > len)
12647 width--;
12648 }
12649 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012650 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12651 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000012652 if (fill != ' ') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012653 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12654 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012655 }
12656 rescnt -= 2;
12657 width -= 2;
12658 if (width < 0)
12659 width = 0;
12660 len -= 2;
12661 }
12662 if (width > len && !(flags & F_LJUST)) {
12663 do {
12664 --rescnt;
12665 *res++ = fill;
12666 } while (--width > len);
12667 }
12668 if (fill == ' ') {
12669 if (sign)
12670 *res++ = sign;
12671 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012672 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12673 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
12674 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12675 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012676 }
12677 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012678 /* Copy all characters, preserving len */
12679 len1 = len;
12680 while (len1--) {
12681 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12682 rescnt--;
12683 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012684 while (--width >= len) {
12685 --rescnt;
12686 *res++ = ' ';
12687 }
12688 if (dict && (argidx < arglen) && c != '%') {
12689 PyErr_SetString(PyExc_TypeError,
12690 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +000012691 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012692 goto onError;
12693 }
12694 Py_XDECREF(temp);
12695 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012696 } /* until end */
12697 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012698 PyErr_SetString(PyExc_TypeError,
12699 "not all arguments converted during string formatting");
12700 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012701 }
12702
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012703
12704 for (max=0, res = res0; res < res0+reslen-rescnt; res++)
12705 if (*res > max)
12706 max = *res;
12707 result = PyUnicode_New(reslen - rescnt, max);
12708 if (!result)
Benjamin Peterson29060642009-01-31 22:14:21 +000012709 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012710 kind = PyUnicode_KIND(result);
12711 for (res = res0; res < res0+reslen-rescnt; res++)
12712 PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res);
12713 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012714 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012715 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012716 }
12717 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012718 return (PyObject *)result;
12719
Benjamin Peterson29060642009-01-31 22:14:21 +000012720 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012721 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012722 Py_DECREF(uformat);
12723 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012724 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012725 }
12726 return NULL;
12727}
12728
Jeremy Hylton938ace62002-07-17 16:30:39 +000012729static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000012730unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
12731
Tim Peters6d6c1a32001-08-02 04:15:00 +000012732static PyObject *
12733unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12734{
Benjamin Peterson29060642009-01-31 22:14:21 +000012735 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012736 static char *kwlist[] = {"object", "encoding", "errors", 0};
12737 char *encoding = NULL;
12738 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000012739
Benjamin Peterson14339b62009-01-31 16:36:08 +000012740 if (type != &PyUnicode_Type)
12741 return unicode_subtype_new(type, args, kwds);
12742 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000012743 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012744 return NULL;
12745 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012746 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012747 if (encoding == NULL && errors == NULL)
12748 return PyObject_Str(x);
12749 else
Benjamin Peterson29060642009-01-31 22:14:21 +000012750 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000012751}
12752
Guido van Rossume023fe02001-08-30 03:12:59 +000012753static PyObject *
12754unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12755{
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012756 PyUnicodeObject *unicode, *self;
12757 Py_ssize_t length, char_size;
12758 int share_wstr, share_utf8;
12759 unsigned int kind;
12760 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000012761
Benjamin Peterson14339b62009-01-31 16:36:08 +000012762 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012763
12764 unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
12765 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012766 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020012767 assert(_PyUnicode_CHECK(unicode));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012768 if (PyUnicode_READY(unicode))
12769 return NULL;
12770
12771 self = (PyUnicodeObject *) type->tp_alloc(type, 0);
12772 if (self == NULL) {
12773 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012774 return NULL;
12775 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012776 kind = PyUnicode_KIND(unicode);
12777 length = PyUnicode_GET_LENGTH(unicode);
12778
12779 _PyUnicode_LENGTH(self) = length;
12780 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
12781 _PyUnicode_STATE(self).interned = 0;
12782 _PyUnicode_STATE(self).kind = kind;
12783 _PyUnicode_STATE(self).compact = 0;
12784 _PyUnicode_STATE(self).ascii = 0;
12785 _PyUnicode_STATE(self).ready = 1;
12786 _PyUnicode_WSTR(self) = NULL;
12787 _PyUnicode_UTF8_LENGTH(self) = 0;
12788 _PyUnicode_UTF8(self) = NULL;
12789 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020012790 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012791
12792 share_utf8 = 0;
12793 share_wstr = 0;
12794 if (kind == PyUnicode_1BYTE_KIND) {
12795 char_size = 1;
12796 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
12797 share_utf8 = 1;
12798 }
12799 else if (kind == PyUnicode_2BYTE_KIND) {
12800 char_size = 2;
12801 if (sizeof(wchar_t) == 2)
12802 share_wstr = 1;
12803 }
12804 else {
12805 assert(kind == PyUnicode_4BYTE_KIND);
12806 char_size = 4;
12807 if (sizeof(wchar_t) == 4)
12808 share_wstr = 1;
12809 }
12810
12811 /* Ensure we won't overflow the length. */
12812 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
12813 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012814 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012815 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012816 data = PyObject_MALLOC((length + 1) * char_size);
12817 if (data == NULL) {
12818 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012819 goto onError;
12820 }
12821
Victor Stinnerc3c74152011-10-02 20:39:55 +020012822 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012823 if (share_utf8) {
12824 _PyUnicode_UTF8_LENGTH(self) = length;
12825 _PyUnicode_UTF8(self) = data;
12826 }
12827 if (share_wstr) {
12828 _PyUnicode_WSTR_LENGTH(self) = length;
12829 _PyUnicode_WSTR(self) = (wchar_t *)data;
12830 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012831
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012832 Py_MEMCPY(data, PyUnicode_DATA(unicode),
12833 PyUnicode_KIND_SIZE(kind, length + 1));
12834 Py_DECREF(unicode);
12835 return (PyObject *)self;
12836
12837onError:
12838 Py_DECREF(unicode);
12839 Py_DECREF(self);
12840 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000012841}
12842
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012843PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000012844 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000012845\n\
Collin Winterd474ce82007-08-07 19:42:11 +000012846Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000012847encoding defaults to the current default string encoding.\n\
12848errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000012849
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012850static PyObject *unicode_iter(PyObject *seq);
12851
Guido van Rossumd57fd912000-03-10 22:53:23 +000012852PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000012853 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012854 "str", /* tp_name */
12855 sizeof(PyUnicodeObject), /* tp_size */
12856 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012857 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012858 (destructor)unicode_dealloc, /* tp_dealloc */
12859 0, /* tp_print */
12860 0, /* tp_getattr */
12861 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000012862 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012863 unicode_repr, /* tp_repr */
12864 &unicode_as_number, /* tp_as_number */
12865 &unicode_as_sequence, /* tp_as_sequence */
12866 &unicode_as_mapping, /* tp_as_mapping */
12867 (hashfunc) unicode_hash, /* tp_hash*/
12868 0, /* tp_call*/
12869 (reprfunc) unicode_str, /* tp_str */
12870 PyObject_GenericGetAttr, /* tp_getattro */
12871 0, /* tp_setattro */
12872 0, /* tp_as_buffer */
12873 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000012874 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012875 unicode_doc, /* tp_doc */
12876 0, /* tp_traverse */
12877 0, /* tp_clear */
12878 PyUnicode_RichCompare, /* tp_richcompare */
12879 0, /* tp_weaklistoffset */
12880 unicode_iter, /* tp_iter */
12881 0, /* tp_iternext */
12882 unicode_methods, /* tp_methods */
12883 0, /* tp_members */
12884 0, /* tp_getset */
12885 &PyBaseObject_Type, /* tp_base */
12886 0, /* tp_dict */
12887 0, /* tp_descr_get */
12888 0, /* tp_descr_set */
12889 0, /* tp_dictoffset */
12890 0, /* tp_init */
12891 0, /* tp_alloc */
12892 unicode_new, /* tp_new */
12893 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012894};
12895
12896/* Initialize the Unicode implementation */
12897
Thomas Wouters78890102000-07-22 19:25:51 +000012898void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012899{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012900 int i;
12901
Thomas Wouters477c8d52006-05-27 19:21:47 +000012902 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012903 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012904 0x000A, /* LINE FEED */
12905 0x000D, /* CARRIAGE RETURN */
12906 0x001C, /* FILE SEPARATOR */
12907 0x001D, /* GROUP SEPARATOR */
12908 0x001E, /* RECORD SEPARATOR */
12909 0x0085, /* NEXT LINE */
12910 0x2028, /* LINE SEPARATOR */
12911 0x2029, /* PARAGRAPH SEPARATOR */
12912 };
12913
Fred Drakee4315f52000-05-09 19:53:39 +000012914 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020012915 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012916 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012917 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012918
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012919 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000012920 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000012921 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012922 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012923
12924 /* initialize the linebreak bloom filter */
12925 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012926 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020012927 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012928
12929 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012930}
12931
12932/* Finalize the Unicode implementation */
12933
/* Clear the free list of str objects.

   This implementation has nothing to release and always reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
12939
Guido van Rossumd57fd912000-03-10 22:53:23 +000012940void
Thomas Wouters78890102000-07-22 19:25:51 +000012941_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012942{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012943 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012944
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000012945 Py_XDECREF(unicode_empty);
12946 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000012947
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012948 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012949 if (unicode_latin1[i]) {
12950 Py_DECREF(unicode_latin1[i]);
12951 unicode_latin1[i] = NULL;
12952 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012953 }
Christian Heimesa156e092008-02-16 07:38:31 +000012954 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012955}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000012956
Walter Dörwald16807132007-05-25 13:52:07 +000012957void
12958PyUnicode_InternInPlace(PyObject **p)
12959{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012960 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
12961 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020012962#ifdef Py_DEBUG
12963 assert(s != NULL);
12964 assert(_PyUnicode_CHECK(s));
12965#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000012966 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020012967 return;
12968#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000012969 /* If it's a subclass, we don't really know what putting
12970 it in the interned dict might do. */
12971 if (!PyUnicode_CheckExact(s))
12972 return;
12973 if (PyUnicode_CHECK_INTERNED(s))
12974 return;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012975 if (PyUnicode_READY(s) == -1) {
Victor Stinner4fae54c2011-10-03 02:01:52 +020012976 assert(0 && "PyUnicode_READY fail in PyUnicode_InternInPlace");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012977 return;
12978 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012979 if (interned == NULL) {
12980 interned = PyDict_New();
12981 if (interned == NULL) {
12982 PyErr_Clear(); /* Don't leave an exception */
12983 return;
12984 }
12985 }
12986 /* It might be that the GetItem call fails even
12987 though the key is present in the dictionary,
12988 namely when this happens during a stack overflow. */
12989 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000012990 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012991 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000012992
Benjamin Peterson29060642009-01-31 22:14:21 +000012993 if (t) {
12994 Py_INCREF(t);
12995 Py_DECREF(*p);
12996 *p = t;
12997 return;
12998 }
Walter Dörwald16807132007-05-25 13:52:07 +000012999
Benjamin Peterson14339b62009-01-31 16:36:08 +000013000 PyThreadState_GET()->recursion_critical = 1;
13001 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
13002 PyErr_Clear();
13003 PyThreadState_GET()->recursion_critical = 0;
13004 return;
13005 }
13006 PyThreadState_GET()->recursion_critical = 0;
13007 /* The two references in interned are not counted by refcnt.
13008 The deallocator will take care of this */
13009 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013010 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000013011}
13012
13013void
13014PyUnicode_InternImmortal(PyObject **p)
13015{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013016 PyUnicodeObject *u = (PyUnicodeObject *)*p;
13017
Benjamin Peterson14339b62009-01-31 16:36:08 +000013018 PyUnicode_InternInPlace(p);
13019 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013020 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013021 Py_INCREF(*p);
13022 }
Walter Dörwald16807132007-05-25 13:52:07 +000013023}
13024
13025PyObject *
13026PyUnicode_InternFromString(const char *cp)
13027{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013028 PyObject *s = PyUnicode_FromString(cp);
13029 if (s == NULL)
13030 return NULL;
13031 PyUnicode_InternInPlace(&s);
13032 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000013033}
13034
Alexander Belopolsky40018472011-02-26 01:02:56 +000013035void
13036_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000013037{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013038 PyObject *keys;
13039 PyUnicodeObject *s;
13040 Py_ssize_t i, n;
13041 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000013042
Benjamin Peterson14339b62009-01-31 16:36:08 +000013043 if (interned == NULL || !PyDict_Check(interned))
13044 return;
13045 keys = PyDict_Keys(interned);
13046 if (keys == NULL || !PyList_Check(keys)) {
13047 PyErr_Clear();
13048 return;
13049 }
Walter Dörwald16807132007-05-25 13:52:07 +000013050
Benjamin Peterson14339b62009-01-31 16:36:08 +000013051 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
13052 detector, interned unicode strings are not forcibly deallocated;
13053 rather, we give them their stolen references back, and then clear
13054 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000013055
Benjamin Peterson14339b62009-01-31 16:36:08 +000013056 n = PyList_GET_SIZE(keys);
13057 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000013058 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013059 for (i = 0; i < n; i++) {
13060 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013061 if (PyUnicode_READY(s) == -1)
13062 fprintf(stderr, "could not ready string\n");
13063 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013064 case SSTATE_NOT_INTERNED:
13065 /* XXX Shouldn't happen */
13066 break;
13067 case SSTATE_INTERNED_IMMORTAL:
13068 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013069 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013070 break;
13071 case SSTATE_INTERNED_MORTAL:
13072 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013073 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013074 break;
13075 default:
13076 Py_FatalError("Inconsistent interned string state.");
13077 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013078 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013079 }
13080 fprintf(stderr, "total size of all interned strings: "
13081 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
13082 "mortal/immortal\n", mortal_size, immortal_size);
13083 Py_DECREF(keys);
13084 PyDict_Clear(interned);
13085 Py_DECREF(interned);
13086 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000013087}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013088
13089
13090/********************* Unicode Iterator **************************/
13091
/* State of an iterator over a str object (see PyUnicodeIter_Type). */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;     /* Index of the next character to yield */
    PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
13097
13098static void
13099unicodeiter_dealloc(unicodeiterobject *it)
13100{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013101 _PyObject_GC_UNTRACK(it);
13102 Py_XDECREF(it->it_seq);
13103 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013104}
13105
/* tp_traverse: report the iterated string (the only object reference
   the iterator holds) to the garbage collector.  Returns 0 unless the
   visit callback reports otherwise via Py_VISIT. */
static int
unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
{
    /* Py_VISIT uses the parameters named `visit` and `arg`. */
    Py_VISIT(it->it_seq);
    return 0;
}
13112
13113static PyObject *
13114unicodeiter_next(unicodeiterobject *it)
13115{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013116 PyUnicodeObject *seq;
13117 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013118
Benjamin Peterson14339b62009-01-31 16:36:08 +000013119 assert(it != NULL);
13120 seq = it->it_seq;
13121 if (seq == NULL)
13122 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013123 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013124
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013125 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
13126 int kind = PyUnicode_KIND(seq);
13127 void *data = PyUnicode_DATA(seq);
13128 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
13129 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013130 if (item != NULL)
13131 ++it->it_index;
13132 return item;
13133 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013134
Benjamin Peterson14339b62009-01-31 16:36:08 +000013135 Py_DECREF(seq);
13136 it->it_seq = NULL;
13137 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013138}
13139
13140static PyObject *
13141unicodeiter_len(unicodeiterobject *it)
13142{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013143 Py_ssize_t len = 0;
13144 if (it->it_seq)
13145 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
13146 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013147}
13148
/* Docstring for the iterator's __length_hint__ method. */
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");

/* Method table of the str iterator type: only __length_hint__, which
   takes no arguments and is implemented by unicodeiter_len(). */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {NULL,              NULL}           /* sentinel */
};
13156
/* Type object for iterators over str ("str_iterator").  Instances are
   unicodeiterobject structs; they are GC-tracked, return themselves from
   tp_iter and produce one character per tp_iternext call. */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",         /* tp_name */
    sizeof(unicodeiterobject),      /* tp_basicsize */
    0,              /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,              /* tp_print */
    0,              /* tp_getattr */
    0,              /* tp_setattr */
    0,              /* tp_reserved */
    0,              /* tp_repr */
    0,              /* tp_as_number */
    0,              /* tp_as_sequence */
    0,              /* tp_as_mapping */
    0,              /* tp_hash */
    0,              /* tp_call */
    0,              /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,              /* tp_setattro */
    0,              /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,              /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,              /* tp_clear */
    0,              /* tp_richcompare */
    0,              /* tp_weaklistoffset */
    PyObject_SelfIter,          /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,            /* tp_methods */
    0,              /* slot following tp_methods; presumably tp_members -- confirm */
};
13189
13190static PyObject *
13191unicode_iter(PyObject *seq)
13192{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013193 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013194
Benjamin Peterson14339b62009-01-31 16:36:08 +000013195 if (!PyUnicode_Check(seq)) {
13196 PyErr_BadInternalCall();
13197 return NULL;
13198 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013199 if (PyUnicode_READY(seq) == -1)
13200 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013201 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
13202 if (it == NULL)
13203 return NULL;
13204 it->it_index = 0;
13205 Py_INCREF(seq);
13206 it->it_seq = (PyUnicodeObject *)seq;
13207 _PyObject_GC_TRACK(it);
13208 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013209}
13210
/* "uniops.h" is included twice as a template: first with UNIOP(x)
   expanding to Py_UNICODE_##x and UNIOP_t to Py_UNICODE, then with
   Py_UCS4_##x and Py_UCS4.
   NOTE(review): presumably this generates the same set of helper
   functions once per character type -- confirm against uniops.h. */
#define UNIOP(x) Py_UNICODE_##x
#define UNIOP_t Py_UNICODE
#include "uniops.h"
#undef UNIOP
#undef UNIOP_t
#define UNIOP(x) Py_UCS4_##x
#define UNIOP_t Py_UCS4
#include "uniops.h"
#undef UNIOP
#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000013221
Victor Stinner71133ff2010-09-01 23:43:53 +000013222Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000013223PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000013224{
13225 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
13226 Py_UNICODE *copy;
13227 Py_ssize_t size;
13228
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013229 if (!PyUnicode_Check(unicode)) {
13230 PyErr_BadArgument();
13231 return NULL;
13232 }
Victor Stinner71133ff2010-09-01 23:43:53 +000013233 /* Ensure we won't overflow the size. */
13234 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
13235 PyErr_NoMemory();
13236 return NULL;
13237 }
13238 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
13239 size *= sizeof(Py_UNICODE);
13240 copy = PyMem_Malloc(size);
13241 if (copy == NULL) {
13242 PyErr_NoMemory();
13243 return NULL;
13244 }
13245 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
13246 return copy;
13247}
Martin v. Löwis5b222132007-06-10 09:51:05 +000013248
Georg Brandl66c221e2010-10-14 07:04:07 +000013249/* A _string module, to export formatter_parser and formatter_field_name_split
13250 to the string.Formatter class implemented in Python. */
13251
/* Functions exported by the _string module.  Both are METH_O, i.e. they
   take exactly one argument. */
static PyMethodDef _string_methods[] = {
    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
     METH_O, PyDoc_STR("split the argument as a field name")},
    {"formatter_parser", (PyCFunction) formatter_parser,
     METH_O, PyDoc_STR("parse the argument as a format string")},
    {NULL, NULL}    /* sentinel */
};
13259
/* Definition of the _string module, consumed by PyInit__string(). */
static struct PyModuleDef _string_module = {
    PyModuleDef_HEAD_INIT,
    "_string",                          /* module name */
    PyDoc_STR("string helper module"),  /* module docstring */
    0,                                  /* presumably m_size -- confirm */
    _string_methods,                    /* method table */
    NULL,                               /* remaining optional slots are unused */
    NULL,
    NULL,
    NULL
};
13271
13272PyMODINIT_FUNC
13273PyInit__string(void)
13274{
13275 return PyModule_Create(&_string_module);
13276}
13277
13278
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013279#ifdef __cplusplus
13280}
13281#endif