blob: 4a7815394eb176082df8e7ebb87509deb04f1f24 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
Guido van Rossumd57fd912000-03-10 22:53:23 +000049/* Limit for the Unicode object free list */
50
Christian Heimes2202f872008-02-06 14:31:34 +000051#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000052
53/* Limit for the Unicode object free list stay alive optimization.
54
55 The implementation will keep allocated Unicode memory intact for
56 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000057 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000058
Christian Heimes2202f872008-02-06 14:31:34 +000059 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000060 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000061 malloc()-overhead) bytes of unused garbage.
62
63 Setting the limit to 0 effectively turns the feature off.
64
Guido van Rossumfd4b9572000-04-10 13:51:10 +000065 Note: This is an experimental feature ! If you get core dumps when
66 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000067
68*/
69
Guido van Rossumfd4b9572000-04-10 13:51:10 +000070#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000071
72/* Endianness switches; defaults to little endian */
73
74#ifdef WORDS_BIGENDIAN
75# define BYTEORDER_IS_BIG_ENDIAN
76#else
77# define BYTEORDER_IS_LITTLE_ENDIAN
78#endif
79
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000080/* --- Globals ------------------------------------------------------------
81
82 The globals are initialized by the _PyUnicode_Init() API and should
83 not be used before calling that API.
84
85*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000086
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000087
88#ifdef __cplusplus
89extern "C" {
90#endif
91
/* Type check used by the accessor macros below.  Debug builds run the
   full consistency checker; other builds only test the object type. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020097
/* Raw field accessors: plain casts, no type or readiness checks. */
#define _PyUnicode_LENGTH(op)      (((PyASCIIObject *)(op))->length)
#define _PyUnicode_HASH(op)        (((PyASCIIObject *)(op))->hash)
#define _PyUnicode_STATE(op)       (((PyASCIIObject *)(op))->state)
#define _PyUnicode_WSTR(op)        (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_UTF8(op)        (((PyCompactUnicodeObject*)(op))->utf8)
#define _PyUnicode_UTF8_LENGTH(op) (((PyCompactUnicodeObject*)(op))->utf8_length)
#define _PyUnicode_DATA_ANY(op)    (((PyUnicodeObject*)(op))->data.any)

/* Checked accessors: assert that op is a unicode object first. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* UTF-8 view of a ready string.  For a compact ASCII string this is the
   memory directly following the PyASCIIObject header and the length is
   the object's own length; otherwise the utf8/utf8_length fields are
   used. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     (PyUnicode_IS_COMPACT_ASCII(op)                    \
          ? ((char*)((PyASCIIObject*)(op) + 1))         \
          : _PyUnicode_UTF8(op)))
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     (PyUnicode_IS_COMPACT_ASCII(op)                    \
          ? ((PyASCIIObject*)(op))->length              \
          : _PyUnicode_UTF8_LENGTH(op)))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200126
/* Local override of PyUnicode_READY: evaluates to 0 when op is already
   ready, otherwise to the result of _PyUnicode_Ready().  The object type
   is asserted first. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_READY(op)                           \
          ? _PyUnicode_Ready((PyObject *)(op))          \
          : 0))
132
/* True if the Unicode object owns a separately allocated UTF-8 block,
   i.e. it is not compact ASCII, its utf8 pointer is set, and that pointer
   does not alias the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op) &&                \
      _PyUnicode_UTF8(op) &&                            \
      _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
140
/* Generic helper macro that copies a run of characters while converting
   each one from from_type to to_type.  from_type and to_type must be valid
   type names; begin and end delimit the source characters and should be
   of type "from_type *"; to points to the destination buffer, which
   receives (end - begin) items of type to_type. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        const from_type *iter_;                         \
        to_type *to_;                                   \
        iter_ = (begin);                                \
        to_ = (to_type *)(to);                          \
        while (iter_ < (end)) {                         \
            *to_ = (to_type)*iter_;                     \
            ++iter_;                                    \
            ++to_;                                      \
        }                                               \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200155
/* Call after modifying a Unicode string: sets the stored hash to -1. */
#define _PyUnicode_DIRTY(op)                            \
    do {                                                \
        _PyUnicode_HASH(op) = -1;                       \
    } while (0)
158
Walter Dörwald16807132007-05-25 13:52:07 +0000159/* This dictionary holds all interned unicode strings. Note that references
160 to strings in this dictionary are *not* counted in the string's ob_refcnt.
161 When the interned string reaches a refcnt of 0 the string deallocation
162 function will delete the reference from this dictionary.
163
   Another way to look at this is to say that the actual reference
   count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000166*/
167static PyObject *interned;
168
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000169/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200170static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000171
172/* Single character Unicode strings in the Latin-1 range are being
173 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200174static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000175
/* Fast detection of the most frequent whitespace characters: entry N is 1
   when the character with code N is flagged as whitespace, 0 otherwise.
   The table has one entry for each of the 128 codes 0x00 - 0x7F. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x08 */
    0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x09 CHARACTER TABULATION, 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    1, 1, 1, 1, 1,
    /* 0x0E - 0x1B */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    1, 1, 1, 1,
    /* 0x20 SPACE */
    1,
    /* 0x21 - 0x3F */
    0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
206
Victor Stinnerfe226c02011-10-03 03:52:20 +0200207static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
208
Alexander Belopolsky40018472011-02-26 01:02:56 +0000209static PyObject *
210unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000211 PyObject **errorHandler,const char *encoding, const char *reason,
212 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
213 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
214
Alexander Belopolsky40018472011-02-26 01:02:56 +0000215static void
216raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300217 const char *encoding,
218 const Py_UNICODE *unicode, Py_ssize_t size,
219 Py_ssize_t startpos, Py_ssize_t endpos,
220 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000221
/* Same kind of lookup table for line breaks: entry N is 1 when the
   character with code N is flagged as a line break, 0 otherwise.  The
   table has one entry for each of the 128 codes 0x00 - 0x7F. */
static unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x09 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    1, 1, 1, 1,
    /* 0x0E - 0x1B */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR */
    1, 1, 1,
    /* 0x1F */
    0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
249
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300250/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
251 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000252Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000253PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000254{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000255#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000256 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000257#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000258 /* This is actually an illegal character, so it should
259 not be passed to unichr. */
260 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000261#endif
262}
263
#ifdef Py_DEBUG
/* Debug-only sanity check of a unicode object's state flags.

   Asserts that the kind/compact/ascii/ready flags and the wstr, utf8 and
   data pointers form one of the four accepted combinations (compact
   ASCII, compact non-ASCII, legacy not-ready "wstr", legacy ready).
   Returns 1 when every assertion holds, so that it can be used inside an
   assert() expression. */
static int
_PyUnicode_CheckConsistency(void *op)
{
    PyASCIIObject *ascii;
    PyCompactUnicodeObject *compact;
    PyUnicodeObject *unicode;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    compact = (PyCompactUnicodeObject *)op;
    unicode = (PyUnicodeObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1) {
        /* ASCII strings are always compact, ready and 1 byte wide. */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.compact == 1);
        assert(ascii->state.ready == 1);
    }
    else if (ascii->state.compact == 1) {
        /* Compact, non-ASCII. */
        assert(kind == PyUnicode_1BYTE_KIND
               || kind == PyUnicode_2BYTE_KIND
               || kind == PyUnicode_4BYTE_KIND);
        assert(ascii->state.ascii == 0);
        assert(ascii->state.ready == 1);
    }
    else if (kind == PyUnicode_WCHAR_KIND) {
        /* Legacy string that is not ready yet: only wstr is set. */
        assert(ascii->state.compact == 0);
        assert(ascii->state.ascii == 0);
        assert(ascii->state.ready == 0);
        assert(ascii->wstr != NULL);
        assert(unicode->data.any == NULL);
        assert(compact->utf8 == NULL);
        assert(ascii->state.interned == SSTATE_NOT_INTERNED);
    }
    else {
        /* Legacy string that has been made ready. */
        assert(kind == PyUnicode_1BYTE_KIND
               || kind == PyUnicode_2BYTE_KIND
               || kind == PyUnicode_4BYTE_KIND);
        assert(ascii->state.compact == 0);
        assert(ascii->state.ready == 1);
        assert(unicode->data.any != NULL);
        assert(ascii->state.ascii == 0);
    }
    return 1;
}
#endif
314
Thomas Wouters477c8d52006-05-27 19:21:47 +0000315/* --- Bloom Filters ----------------------------------------------------- */
316
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the low
   log2(BLOOM_WIDTH) bits of each Unicode character as the bit index. */
320
321/* the linebreak mask is set up by Unicode_Init below */
322
Antoine Pitrouf068f942010-01-13 14:19:12 +0000323#if LONG_BIT >= 128
324#define BLOOM_WIDTH 128
325#elif LONG_BIT >= 64
326#define BLOOM_WIDTH 64
327#elif LONG_BIT >= 32
328#define BLOOM_WIDTH 32
329#else
330#error "LONG_BIT is smaller than 32"
331#endif
332
Thomas Wouters477c8d52006-05-27 19:21:47 +0000333#define BLOOM_MASK unsigned long
334
335static BLOOM_MASK bloom_linebreak;
336
Antoine Pitrouf068f942010-01-13 14:19:12 +0000337#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
338#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000339
Benjamin Peterson29060642009-01-31 22:14:21 +0000340#define BLOOM_LINEBREAK(ch) \
341 ((ch) < 128U ? ascii_linebreak[(ch)] : \
342 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000343
Alexander Belopolsky40018472011-02-26 01:02:56 +0000344Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200345make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000346{
347 /* calculate simple bloom-style bitmask for a given unicode string */
348
Antoine Pitrouf068f942010-01-13 14:19:12 +0000349 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000350 Py_ssize_t i;
351
352 mask = 0;
353 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200354 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000355
356 return mask;
357}
358
/* True if chr occurs in str: the bloom mask is tested first and the
   string itself is only searched when the mask reports a possible hit. */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr) &&                                                \
     (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000362
Guido van Rossumd57fd912000-03-10 22:53:23 +0000363/* --- Unicode Object ----------------------------------------------------- */
364
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200365static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200366fixup(PyUnicodeObject *self, Py_UCS4 (*fixfct)(PyUnicodeObject *s));
367
368Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
369 Py_ssize_t size, Py_UCS4 ch,
370 int direction)
371{
372 /* like wcschr, but doesn't stop at NULL characters */
373 Py_ssize_t i;
374 if (direction == 1) {
375 for(i = 0; i < size; i++)
376 if (PyUnicode_READ(kind, s, i) == ch)
377 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
378 }
379 else {
380 for(i = size-1; i >= 0; i--)
381 if (PyUnicode_READ(kind, s, i) == ch)
382 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
383 }
384 return NULL;
385}
386
Victor Stinnerfe226c02011-10-03 03:52:20 +0200387static PyObject*
388resize_compact(PyObject *unicode, Py_ssize_t length)
389{
390 Py_ssize_t char_size;
391 Py_ssize_t struct_size;
392 Py_ssize_t new_size;
393 int share_wstr;
394
395 assert(PyUnicode_IS_READY(unicode));
396 char_size = PyUnicode_CHARACTER_SIZE(unicode);
397 if (PyUnicode_IS_COMPACT_ASCII(unicode))
398 struct_size = sizeof(PyASCIIObject);
399 else
400 struct_size = sizeof(PyCompactUnicodeObject);
401 share_wstr = (_PyUnicode_WSTR(unicode) == PyUnicode_DATA(unicode));
402
403 _Py_DEC_REFTOTAL;
404 _Py_ForgetReference(unicode);
405
406 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
407 PyErr_NoMemory();
408 return NULL;
409 }
410 new_size = (struct_size + (length + 1) * char_size);
411
412 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
413 if (unicode == NULL) {
414 PyObject_Del(unicode);
415 PyErr_NoMemory();
416 return NULL;
417 }
418 _Py_NewReference(unicode);
419 _PyUnicode_LENGTH(unicode) = length;
420 if (share_wstr)
421 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
422 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
423 length, 0);
424 return unicode;
425}
426
Alexander Belopolsky40018472011-02-26 01:02:56 +0000427static int
Victor Stinnerfe226c02011-10-03 03:52:20 +0200428resize_inplace(register PyUnicodeObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000429{
430 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000431
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200432 assert(!PyUnicode_IS_COMPACT(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200433
Victor Stinnerfe226c02011-10-03 03:52:20 +0200434 assert(Py_REFCNT(unicode) == 1);
Victor Stinnerb15d4d82011-09-28 23:59:20 +0200435 _PyUnicode_DIRTY(unicode);
Tim Petersced69f82003-09-16 20:30:58 +0000436
Victor Stinnerfe226c02011-10-03 03:52:20 +0200437 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
438 {
439 PyObject_DEL(_PyUnicode_UTF8(unicode));
440 _PyUnicode_UTF8(unicode) = NULL;
441 }
442
443 if (PyUnicode_IS_READY(unicode)) {
444 Py_ssize_t char_size;
445 Py_ssize_t new_size;
446 int share_wstr;
447 void *data;
448
449 data = _PyUnicode_DATA_ANY(unicode);
450 assert(data != NULL);
451 char_size = PyUnicode_CHARACTER_SIZE(unicode);
452 share_wstr = (_PyUnicode_WSTR(unicode) == data);
453
454 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
455 PyErr_NoMemory();
456 return -1;
457 }
458 new_size = (length + 1) * char_size;
459
460 data = (PyObject *)PyObject_REALLOC(data, new_size);
461 if (data == NULL) {
462 PyErr_NoMemory();
463 return -1;
464 }
465 _PyUnicode_DATA_ANY(unicode) = data;
466 if (share_wstr)
467 _PyUnicode_WSTR(unicode) = data;
468 _PyUnicode_LENGTH(unicode) = length;
469 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
470 if (share_wstr)
471 return 0;
472 }
473 if (_PyUnicode_WSTR(unicode) != NULL) {
474 assert(_PyUnicode_WSTR(unicode) != NULL);
475
476 oldstr = _PyUnicode_WSTR(unicode);
477 _PyUnicode_WSTR(unicode) = PyObject_REALLOC(_PyUnicode_WSTR(unicode),
478 sizeof(Py_UNICODE) * (length + 1));
479 if (!_PyUnicode_WSTR(unicode)) {
480 _PyUnicode_WSTR(unicode) = (Py_UNICODE *)oldstr;
481 PyErr_NoMemory();
482 return -1;
483 }
484 _PyUnicode_WSTR(unicode)[length] = 0;
485 _PyUnicode_WSTR_LENGTH(unicode) = length;
486 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000487 return 0;
488}
489
Victor Stinnerfe226c02011-10-03 03:52:20 +0200490static PyObject*
491resize_copy(PyObject *unicode, Py_ssize_t length)
492{
493 Py_ssize_t copy_length;
494 if (PyUnicode_IS_COMPACT(unicode)) {
495 PyObject *copy;
496 assert(PyUnicode_IS_READY(unicode));
497
498 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
499 if (copy == NULL)
500 return NULL;
501
502 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
503 if (PyUnicode_CopyCharacters(copy, 0,
504 unicode, 0,
505 copy_length) < 0)
506 {
507 Py_DECREF(copy);
508 return NULL;
509 }
510 return copy;
511 } else {
Victor Stinner2fd82272011-10-03 04:06:05 +0200512 PyUnicodeObject *w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200513 assert(_PyUnicode_WSTR(unicode) != NULL);
514 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinner2fd82272011-10-03 04:06:05 +0200515 w = _PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200516 if (w == NULL)
517 return NULL;
518 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
519 copy_length = Py_MIN(copy_length, length);
520 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
521 copy_length);
522 return (PyObject*)w;
523 }
524}
525
/* We allocate one more character to make sure the string is
   U+0000 terminated; some code (e.g. new_identifier)
   relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000529
530 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000531 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000532
533*/
534
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200535#ifdef Py_DEBUG
536int unicode_old_new_calls = 0;
537#endif
538
Alexander Belopolsky40018472011-02-26 01:02:56 +0000539static PyUnicodeObject *
540_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000541{
542 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200543 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000544
Thomas Wouters477c8d52006-05-27 19:21:47 +0000545 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000546 if (length == 0 && unicode_empty != NULL) {
547 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200548 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000549 }
550
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000551 /* Ensure we won't overflow the size. */
552 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
553 return (PyUnicodeObject *)PyErr_NoMemory();
554 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200555 if (length < 0) {
556 PyErr_SetString(PyExc_SystemError,
557 "Negative size passed to _PyUnicode_New");
558 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000559 }
560
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200561#ifdef Py_DEBUG
562 ++unicode_old_new_calls;
563#endif
564
565 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
566 if (unicode == NULL)
567 return NULL;
568 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
569 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
570 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000571 PyErr_NoMemory();
572 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000573 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200574
Jeremy Hyltond8082792003-09-16 19:41:39 +0000575 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000576 * the caller fails before initializing str -- unicode_resize()
577 * reads str[0], and the Keep-Alive optimization can keep memory
578 * allocated for str alive across a call to unicode_dealloc(unicode).
579 * We don't want unicode_resize to read uninitialized memory in
580 * that case.
581 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200582 _PyUnicode_WSTR(unicode)[0] = 0;
583 _PyUnicode_WSTR(unicode)[length] = 0;
584 _PyUnicode_WSTR_LENGTH(unicode) = length;
585 _PyUnicode_HASH(unicode) = -1;
586 _PyUnicode_STATE(unicode).interned = 0;
587 _PyUnicode_STATE(unicode).kind = 0;
588 _PyUnicode_STATE(unicode).compact = 0;
589 _PyUnicode_STATE(unicode).ready = 0;
590 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200591 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200592 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200593 _PyUnicode_UTF8(unicode) = NULL;
594 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000595 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000596
Benjamin Peterson29060642009-01-31 22:14:21 +0000597 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000598 /* XXX UNREF/NEWREF interface should be more symmetrical */
599 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000600 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000601 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000602 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000603}
604
Victor Stinnerf42dc442011-10-02 23:33:16 +0200605static const char*
606unicode_kind_name(PyObject *unicode)
607{
Victor Stinner910337b2011-10-03 03:20:16 +0200608 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerf42dc442011-10-02 23:33:16 +0200609 if (!PyUnicode_IS_COMPACT(unicode))
610 {
611 if (!PyUnicode_IS_READY(unicode))
612 return "wstr";
613 switch(PyUnicode_KIND(unicode))
614 {
615 case PyUnicode_1BYTE_KIND:
616 if (PyUnicode_IS_COMPACT_ASCII(unicode))
617 return "legacy ascii";
618 else
619 return "legacy latin1";
620 case PyUnicode_2BYTE_KIND:
621 return "legacy UCS2";
622 case PyUnicode_4BYTE_KIND:
623 return "legacy UCS4";
624 default:
625 return "<legacy invalid kind>";
626 }
627 }
628 assert(PyUnicode_IS_READY(unicode));
629 switch(PyUnicode_KIND(unicode))
630 {
631 case PyUnicode_1BYTE_KIND:
632 if (PyUnicode_IS_COMPACT_ASCII(unicode))
633 return "ascii";
634 else
635 return "compact latin1";
636 case PyUnicode_2BYTE_KIND:
637 return "compact UCS2";
638 case PyUnicode_4BYTE_KIND:
639 return "compact UCS4";
640 default:
641 return "<invalid compact kind>";
642 }
643}
644
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200645#ifdef Py_DEBUG
646int unicode_new_new_calls = 0;
647
648/* Functions wrapping macros for use in debugger */
649char *_PyUnicode_utf8(void *unicode){
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200650 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200651}
652
653void *_PyUnicode_compact_data(void *unicode) {
654 return _PyUnicode_COMPACT_DATA(unicode);
655}
656void *_PyUnicode_data(void *unicode){
657 printf("obj %p\n", unicode);
658 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
659 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
660 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
661 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
662 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
663 return PyUnicode_DATA(unicode);
664}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200665
666void
667_PyUnicode_Dump(PyObject *op)
668{
669 PyASCIIObject *ascii = (PyASCIIObject *)op;
670 printf("%s: len=%zu, wstr=%p",
671 unicode_kind_name(op),
672 ascii->length,
673 ascii->wstr);
674 if (!ascii->state.ascii) {
675 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
676 printf(" (%zu), utf8=%p (%zu)",
677 compact->wstr_length,
678 compact->utf8,
679 compact->utf8_length);
680 }
681 if (!ascii->state.compact) {
682 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
683 printf(", data=%p",
684 unicode->data.any);
685 }
686 printf("\n");
687}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200688#endif
689
690PyObject *
691PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
692{
693 PyObject *obj;
694 PyCompactUnicodeObject *unicode;
695 void *data;
696 int kind_state;
697 int is_sharing = 0, is_ascii = 0;
698 Py_ssize_t char_size;
699 Py_ssize_t struct_size;
700
701 /* Optimization for empty strings */
702 if (size == 0 && unicode_empty != NULL) {
703 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200704 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200705 }
706
707#ifdef Py_DEBUG
708 ++unicode_new_new_calls;
709#endif
710
711 struct_size = sizeof(PyCompactUnicodeObject);
712 if (maxchar < 128) {
713 kind_state = PyUnicode_1BYTE_KIND;
714 char_size = 1;
715 is_ascii = 1;
716 struct_size = sizeof(PyASCIIObject);
717 }
718 else if (maxchar < 256) {
719 kind_state = PyUnicode_1BYTE_KIND;
720 char_size = 1;
721 }
722 else if (maxchar < 65536) {
723 kind_state = PyUnicode_2BYTE_KIND;
724 char_size = 2;
725 if (sizeof(wchar_t) == 2)
726 is_sharing = 1;
727 }
728 else {
729 kind_state = PyUnicode_4BYTE_KIND;
730 char_size = 4;
731 if (sizeof(wchar_t) == 4)
732 is_sharing = 1;
733 }
734
735 /* Ensure we won't overflow the size. */
736 if (size < 0) {
737 PyErr_SetString(PyExc_SystemError,
738 "Negative size passed to PyUnicode_New");
739 return NULL;
740 }
741 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
742 return PyErr_NoMemory();
743
744 /* Duplicated allocation code from _PyObject_New() instead of a call to
745 * PyObject_New() so we are able to allocate space for the object and
746 * it's data buffer.
747 */
748 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
749 if (obj == NULL)
750 return PyErr_NoMemory();
751 obj = PyObject_INIT(obj, &PyUnicode_Type);
752 if (obj == NULL)
753 return NULL;
754
755 unicode = (PyCompactUnicodeObject *)obj;
756 if (is_ascii)
757 data = ((PyASCIIObject*)obj) + 1;
758 else
759 data = unicode + 1;
760 _PyUnicode_LENGTH(unicode) = size;
761 _PyUnicode_HASH(unicode) = -1;
762 _PyUnicode_STATE(unicode).interned = 0;
763 _PyUnicode_STATE(unicode).kind = kind_state;
764 _PyUnicode_STATE(unicode).compact = 1;
765 _PyUnicode_STATE(unicode).ready = 1;
766 _PyUnicode_STATE(unicode).ascii = is_ascii;
767 if (is_ascii) {
768 ((char*)data)[size] = 0;
769 _PyUnicode_WSTR(unicode) = NULL;
770 }
771 else if (kind_state == PyUnicode_1BYTE_KIND) {
772 ((char*)data)[size] = 0;
773 _PyUnicode_WSTR(unicode) = NULL;
774 _PyUnicode_WSTR_LENGTH(unicode) = 0;
775 unicode->utf8_length = 0;
776 unicode->utf8 = NULL;
777 }
778 else {
779 unicode->utf8 = NULL;
780 if (kind_state == PyUnicode_2BYTE_KIND)
781 ((Py_UCS2*)data)[size] = 0;
782 else /* kind_state == PyUnicode_4BYTE_KIND */
783 ((Py_UCS4*)data)[size] = 0;
784 if (is_sharing) {
785 _PyUnicode_WSTR_LENGTH(unicode) = size;
786 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
787 }
788 else {
789 _PyUnicode_WSTR_LENGTH(unicode) = 0;
790 _PyUnicode_WSTR(unicode) = NULL;
791 }
792 }
793 return obj;
794}
795
#if SIZEOF_WCHAR_T == 2
/* Expand the 16-bit wchar_t buffer [begin, end) into the UCS4 storage of
   `unicode`.  A high surrogate (0xD800-0xDBFF) directly followed by a low
   surrogate (0xDC00-0xDFFF) is merged into a single code point; any other
   unit, including a lone surrogate, is copied through unchanged.  The other
   conversions are implemented as macros for efficiency.

   `unicode` must already be a 4-byte kind string whose length matches the
   number of code points produced (checked by assertions only), with room for
   one additional terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyUnicodeObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (src[0] >= 0xD800 && src[0] <= 0xDBFF
            && (src + 1) < end
            && src[1] >= 0xDC00 && src[1] <= 0xDFFF)
        {
            /* Combine the two 10-bit payloads into one code point. */
            *dst++ = (((src[0] & 0x3FF)<<10) | (src[1] & 0x3FF)) + 0x10000;
            src += 2;
        }
        else {
            *dst++ = *src++;
        }
    }
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
834
Victor Stinnercd9950f2011-10-02 00:34:53 +0200835static int
836_PyUnicode_Dirty(PyObject *unicode)
837{
Victor Stinner910337b2011-10-03 03:20:16 +0200838 assert(_PyUnicode_CHECK(unicode));
Victor Stinnercd9950f2011-10-02 00:34:53 +0200839 if (Py_REFCNT(unicode) != 1) {
840 PyErr_SetString(PyExc_ValueError,
841 "Cannot modify a string having more than 1 reference");
842 return -1;
843 }
844 _PyUnicode_DIRTY(unicode);
845 return 0;
846}
847
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200848Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200849PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
850 PyObject *from, Py_ssize_t from_start,
851 Py_ssize_t how_many)
852{
Victor Stinnera0702ab2011-09-29 14:14:38 +0200853 unsigned int from_kind, to_kind;
854 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200855
Victor Stinnerb1536152011-09-30 02:26:10 +0200856 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
857 PyErr_BadInternalCall();
858 return -1;
859 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200860
861 if (PyUnicode_READY(from))
862 return -1;
863 if (PyUnicode_READY(to))
864 return -1;
865
Victor Stinnerff9e50f2011-09-28 22:17:19 +0200866 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200867 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
868 PyErr_Format(PyExc_ValueError,
869 "Cannot write %zi characters at %zi "
870 "in a string of %zi characters",
871 how_many, to_start, PyUnicode_GET_LENGTH(to));
872 return -1;
873 }
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200874 if (how_many == 0)
875 return 0;
876
Victor Stinnercd9950f2011-10-02 00:34:53 +0200877 if (_PyUnicode_Dirty(to))
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200878 return -1;
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200879
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200880 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200881 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200882 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200883 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200884
Victor Stinnerf42dc442011-10-02 23:33:16 +0200885 if (from_kind == to_kind
886 /* deny latin1 => ascii */
887 && PyUnicode_MAX_CHAR_VALUE(to) >= PyUnicode_MAX_CHAR_VALUE(from))
888 {
Victor Stinnera0702ab2011-09-29 14:14:38 +0200889 Py_MEMCPY((char*)to_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200890 + PyUnicode_KIND_SIZE(to_kind, to_start),
Victor Stinnera0702ab2011-09-29 14:14:38 +0200891 (char*)from_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200892 + PyUnicode_KIND_SIZE(from_kind, from_start),
893 PyUnicode_KIND_SIZE(to_kind, how_many));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200894 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200895 else if (from_kind == PyUnicode_1BYTE_KIND
896 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200897 {
898 _PyUnicode_CONVERT_BYTES(
899 Py_UCS1, Py_UCS2,
900 PyUnicode_1BYTE_DATA(from) + from_start,
901 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
902 PyUnicode_2BYTE_DATA(to) + to_start
903 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200904 }
Victor Stinner157f83f2011-09-28 21:41:31 +0200905 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200906 && to_kind == PyUnicode_4BYTE_KIND)
907 {
908 _PyUnicode_CONVERT_BYTES(
909 Py_UCS1, Py_UCS4,
910 PyUnicode_1BYTE_DATA(from) + from_start,
911 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
912 PyUnicode_4BYTE_DATA(to) + to_start
913 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200914 }
915 else if (from_kind == PyUnicode_2BYTE_KIND
916 && to_kind == PyUnicode_4BYTE_KIND)
917 {
918 _PyUnicode_CONVERT_BYTES(
919 Py_UCS2, Py_UCS4,
920 PyUnicode_2BYTE_DATA(from) + from_start,
921 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
922 PyUnicode_4BYTE_DATA(to) + to_start
923 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200924 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200925 else {
926 int invalid_kinds;
Victor Stinnerf42dc442011-10-02 23:33:16 +0200927
928 /* check if max_char(from substring) <= max_char(to) */
929 if (from_kind > to_kind
930 /* latin1 => ascii */
931 || (PyUnicode_IS_COMPACT_ASCII(to)
932 && to_kind == PyUnicode_1BYTE_KIND
933 && !PyUnicode_IS_COMPACT_ASCII(from)))
934 {
Victor Stinnera0702ab2011-09-29 14:14:38 +0200935 /* slow path to check for character overflow */
936 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
937 Py_UCS4 ch, maxchar;
938 Py_ssize_t i;
939
940 maxchar = 0;
941 invalid_kinds = 0;
942 for (i=0; i < how_many; i++) {
943 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
944 if (ch > maxchar) {
945 maxchar = ch;
946 if (maxchar > to_maxchar) {
947 invalid_kinds = 1;
948 break;
949 }
950 }
951 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
952 }
953 }
954 else
955 invalid_kinds = 1;
956 if (invalid_kinds) {
957 PyErr_Format(PyExc_ValueError,
Victor Stinnerf42dc442011-10-02 23:33:16 +0200958 "Cannot copy %s characters "
959 "into a string of %s characters",
960 unicode_kind_name(from),
961 unicode_kind_name(to));
Victor Stinnera0702ab2011-09-29 14:14:38 +0200962 return -1;
963 }
964 }
965 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200966}
967
Victor Stinner17222162011-09-28 22:15:37 +0200968/* Find the maximum code point and count the number of surrogate pairs so a
969 correct string length can be computed before converting a string to UCS4.
970 This function counts single surrogates as a character and not as a pair.
971
972 Return 0 on success, or -1 on error. */
973static int
974find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
975 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200976{
977 const wchar_t *iter;
978
Victor Stinnerc53be962011-10-02 21:33:54 +0200979 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200980 if (num_surrogates == NULL || maxchar == NULL) {
981 PyErr_SetString(PyExc_SystemError,
982 "unexpected NULL arguments to "
983 "PyUnicode_FindMaxCharAndNumSurrogatePairs");
984 return -1;
985 }
986
987 *num_surrogates = 0;
988 *maxchar = 0;
989
990 for (iter = begin; iter < end; ) {
991 if (*iter > *maxchar)
992 *maxchar = *iter;
993#if SIZEOF_WCHAR_T == 2
994 if (*iter >= 0xD800 && *iter <= 0xDBFF
995 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
996 {
997 Py_UCS4 surrogate_val;
998 surrogate_val = (((iter[0] & 0x3FF)<<10)
999 | (iter[1] & 0x3FF)) + 0x10000;
1000 ++(*num_surrogates);
1001 if (surrogate_val > *maxchar)
1002 *maxchar = surrogate_val;
1003 iter += 2;
1004 }
1005 else
1006 iter++;
1007#else
1008 iter++;
1009#endif
1010 }
1011 return 0;
1012}
1013
1014#ifdef Py_DEBUG
1015int unicode_ready_calls = 0;
1016#endif
1017
1018int
Victor Stinnerd8f65102011-09-29 19:43:17 +02001019_PyUnicode_Ready(PyObject *obj)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001020{
Victor Stinnerd8f65102011-09-29 19:43:17 +02001021 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001022 wchar_t *end;
1023 Py_UCS4 maxchar = 0;
1024 Py_ssize_t num_surrogates;
1025#if SIZEOF_WCHAR_T == 2
1026 Py_ssize_t length_wo_surrogates;
1027#endif
1028
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001029 /* _PyUnicode_Ready() is only intented for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001030 strings were created using _PyObject_New() and where no canonical
1031 representation (the str field) has been set yet aka strings
1032 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001033 assert(_PyUnicode_CHECK(unicode));
1034 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001035 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001036 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001037 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001038 /* Actually, it should neither be interned nor be anything else: */
1039 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001040
1041#ifdef Py_DEBUG
1042 ++unicode_ready_calls;
1043#endif
1044
1045 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001046 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001047 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001048 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001049
1050 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001051 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1052 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001053 PyErr_NoMemory();
1054 return -1;
1055 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001056 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001057 _PyUnicode_WSTR(unicode), end,
1058 PyUnicode_1BYTE_DATA(unicode));
1059 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1060 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1061 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1062 if (maxchar < 128) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001063 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001064 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001065 }
1066 else {
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001067 _PyUnicode_UTF8(unicode) = NULL;
1068 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001069 }
1070 PyObject_FREE(_PyUnicode_WSTR(unicode));
1071 _PyUnicode_WSTR(unicode) = NULL;
1072 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1073 }
1074 /* In this case we might have to convert down from 4-byte native
1075 wchar_t to 2-byte unicode. */
1076 else if (maxchar < 65536) {
1077 assert(num_surrogates == 0 &&
1078 "FindMaxCharAndNumSurrogatePairs() messed up");
1079
Victor Stinner506f5922011-09-28 22:34:18 +02001080#if SIZEOF_WCHAR_T == 2
1081 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001082 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001083 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1084 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1085 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001086 _PyUnicode_UTF8(unicode) = NULL;
1087 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001088#else
1089 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001090 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001091 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001092 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001093 PyErr_NoMemory();
1094 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001095 }
Victor Stinner506f5922011-09-28 22:34:18 +02001096 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1097 _PyUnicode_WSTR(unicode), end,
1098 PyUnicode_2BYTE_DATA(unicode));
1099 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1100 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1101 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001102 _PyUnicode_UTF8(unicode) = NULL;
1103 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001104 PyObject_FREE(_PyUnicode_WSTR(unicode));
1105 _PyUnicode_WSTR(unicode) = NULL;
1106 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1107#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001108 }
1109 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1110 else {
1111#if SIZEOF_WCHAR_T == 2
1112 /* in case the native representation is 2-bytes, we need to allocate a
1113 new normalized 4-byte version. */
1114 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001115 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1116 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001117 PyErr_NoMemory();
1118 return -1;
1119 }
1120 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1121 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001122 _PyUnicode_UTF8(unicode) = NULL;
1123 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinnerc53be962011-10-02 21:33:54 +02001124 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001125 PyObject_FREE(_PyUnicode_WSTR(unicode));
1126 _PyUnicode_WSTR(unicode) = NULL;
1127 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1128#else
1129 assert(num_surrogates == 0);
1130
Victor Stinnerc3c74152011-10-02 20:39:55 +02001131 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001132 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001133 _PyUnicode_UTF8(unicode) = NULL;
1134 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001135 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1136#endif
1137 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1138 }
1139 _PyUnicode_STATE(unicode).ready = 1;
1140 return 0;
1141}
1142
Alexander Belopolsky40018472011-02-26 01:02:56 +00001143static void
1144unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001145{
Walter Dörwald16807132007-05-25 13:52:07 +00001146 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001147 case SSTATE_NOT_INTERNED:
1148 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001149
Benjamin Peterson29060642009-01-31 22:14:21 +00001150 case SSTATE_INTERNED_MORTAL:
1151 /* revive dead object temporarily for DelItem */
1152 Py_REFCNT(unicode) = 3;
1153 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
1154 Py_FatalError(
1155 "deletion of interned string failed");
1156 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001157
Benjamin Peterson29060642009-01-31 22:14:21 +00001158 case SSTATE_INTERNED_IMMORTAL:
1159 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001160
Benjamin Peterson29060642009-01-31 22:14:21 +00001161 default:
1162 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001163 }
1164
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001165 if (_PyUnicode_WSTR(unicode) &&
1166 (!PyUnicode_IS_READY(unicode) ||
1167 _PyUnicode_WSTR(unicode) != PyUnicode_DATA(unicode)))
1168 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001169 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001170 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001171
1172 if (PyUnicode_IS_COMPACT(unicode)) {
1173 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001174 }
1175 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001176 if (_PyUnicode_DATA_ANY(unicode))
1177 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Benjamin Peterson29060642009-01-31 22:14:21 +00001178 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001179 }
1180}
1181
Alexander Belopolsky40018472011-02-26 01:02:56 +00001182static int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001183unicode_resizable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001184{
Victor Stinnera3be6132011-10-03 02:16:37 +02001185 Py_ssize_t len;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001186 if (Py_REFCNT(unicode) != 1)
1187 return 0;
1188 if (PyUnicode_CHECK_INTERNED(unicode))
1189 return 0;
1190 if (unicode == unicode_empty)
1191 return 0;
Victor Stinnera3be6132011-10-03 02:16:37 +02001192 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
1193 len = PyUnicode_WSTR_LENGTH(unicode);
1194 else
1195 len = PyUnicode_GET_LENGTH(unicode);
1196 if (len == 1) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001197 Py_UCS4 ch;
Victor Stinnera3be6132011-10-03 02:16:37 +02001198 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001199 ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnera3be6132011-10-03 02:16:37 +02001200 else
1201 ch = PyUnicode_READ_CHAR(unicode, 0);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001202 if (ch < 256 && unicode_latin1[ch] == unicode)
1203 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001204 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001205 /* FIXME: reenable resize_inplace */
1206 if (!PyUnicode_IS_COMPACT(unicode))
1207 return 0;
1208 return 1;
1209}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001210
Victor Stinnerfe226c02011-10-03 03:52:20 +02001211static int
1212unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1213{
1214 PyObject *unicode;
1215 Py_ssize_t old_length;
1216
1217 assert(p_unicode != NULL);
1218 unicode = *p_unicode;
1219
1220 assert(unicode != NULL);
1221 assert(PyUnicode_Check(unicode));
1222 assert(0 <= length);
1223
Victor Stinner910337b2011-10-03 03:20:16 +02001224 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001225 old_length = PyUnicode_WSTR_LENGTH(unicode);
1226 else
1227 old_length = PyUnicode_GET_LENGTH(unicode);
1228 if (old_length == length)
1229 return 0;
1230
1231 /* FIXME: really create a new object? */
1232 if (!unicode_resizable(unicode)) {
1233 PyObject *copy = resize_copy(unicode, length);
1234 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001235 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001236 Py_DECREF(*p_unicode);
1237 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001238 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001239 }
1240
Victor Stinnerfe226c02011-10-03 03:52:20 +02001241 if (PyUnicode_IS_COMPACT(unicode)) {
1242 *p_unicode = resize_compact(unicode, length);
1243 if (*p_unicode == NULL)
1244 return -1;
1245 return 0;
1246 } else
1247 return resize_inplace((PyUnicodeObject*)unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001248}
1249
Alexander Belopolsky40018472011-02-26 01:02:56 +00001250int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001251PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001252{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001253 PyObject *unicode;
1254 if (p_unicode == NULL) {
1255 PyErr_BadInternalCall();
1256 return -1;
1257 }
1258 unicode = *p_unicode;
1259 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0
1260 || _PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND)
1261 {
1262 PyErr_BadInternalCall();
1263 return -1;
1264 }
1265 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001266}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001267
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001268static PyObject*
1269get_latin1_char(unsigned char ch)
1270{
Victor Stinnera464fc12011-10-02 20:39:30 +02001271 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001272 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001273 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001274 if (!unicode)
1275 return NULL;
1276 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1277 unicode_latin1[ch] = unicode;
1278 }
1279 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001280 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001281}
1282
Alexander Belopolsky40018472011-02-26 01:02:56 +00001283PyObject *
1284PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001285{
1286 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001287 Py_UCS4 maxchar = 0;
1288 Py_ssize_t num_surrogates;
1289
1290 if (u == NULL)
1291 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001292
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001293 /* If the Unicode data is known at construction time, we can apply
1294 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001295
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001296 /* Optimization for empty strings */
1297 if (size == 0 && unicode_empty != NULL) {
1298 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001299 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001300 }
Tim Petersced69f82003-09-16 20:30:58 +00001301
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001302 /* Single character Unicode objects in the Latin-1 range are
1303 shared when using this constructor */
1304 if (size == 1 && *u < 256)
1305 return get_latin1_char((unsigned char)*u);
1306
1307 /* If not empty and not single character, copy the Unicode data
1308 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001309 if (find_maxchar_surrogates(u, u + size,
1310 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001311 return NULL;
1312
1313 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1314 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001315 if (!unicode)
1316 return NULL;
1317
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001318 switch (PyUnicode_KIND(unicode)) {
1319 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001320 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001321 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1322 break;
1323 case PyUnicode_2BYTE_KIND:
1324#if Py_UNICODE_SIZE == 2
1325 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1326#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001327 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001328 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1329#endif
1330 break;
1331 case PyUnicode_4BYTE_KIND:
1332#if SIZEOF_WCHAR_T == 2
1333 /* This is the only case which has to process surrogates, thus
1334 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001335 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001336#else
1337 assert(num_surrogates == 0);
1338 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1339#endif
1340 break;
1341 default:
1342 assert(0 && "Impossible state");
1343 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001344
1345 return (PyObject *)unicode;
1346}
1347
Alexander Belopolsky40018472011-02-26 01:02:56 +00001348PyObject *
1349PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001350{
1351 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001352
Benjamin Peterson14339b62009-01-31 16:36:08 +00001353 if (size < 0) {
1354 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001355 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001356 return NULL;
1357 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001358
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001359 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001360 some optimizations which share commonly used objects.
1361 Also, this means the input must be UTF-8, so fall back to the
1362 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001363 if (u != NULL) {
1364
Benjamin Peterson29060642009-01-31 22:14:21 +00001365 /* Optimization for empty strings */
1366 if (size == 0 && unicode_empty != NULL) {
1367 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001368 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001369 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001370
1371 /* Single characters are shared when using this constructor.
1372 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001373 if (size == 1 && Py_CHARMASK(*u) < 128)
1374 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001375
1376 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001377 }
1378
Walter Dörwald55507312007-05-18 13:12:10 +00001379 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001380 if (!unicode)
1381 return NULL;
1382
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001383 return (PyObject *)unicode;
1384}
1385
Alexander Belopolsky40018472011-02-26 01:02:56 +00001386PyObject *
1387PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001388{
1389 size_t size = strlen(u);
1390 if (size > PY_SSIZE_T_MAX) {
1391 PyErr_SetString(PyExc_OverflowError, "input too long");
1392 return NULL;
1393 }
1394
1395 return PyUnicode_FromStringAndSize(u, size);
1396}
1397
Victor Stinnere57b1c02011-09-28 22:20:48 +02001398static PyObject*
1399_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001400{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001401 PyObject *res;
1402 unsigned char max = 127;
1403 Py_ssize_t i;
1404 for (i = 0; i < size; i++) {
1405 if (u[i] & 0x80) {
1406 max = 255;
1407 break;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001408 }
1409 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001410 res = PyUnicode_New(size, max);
1411 if (!res)
1412 return NULL;
1413 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1414 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001415}
1416
Victor Stinnere57b1c02011-09-28 22:20:48 +02001417static PyObject*
1418_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001419{
1420 PyObject *res;
1421 Py_UCS2 max = 0;
1422 Py_ssize_t i;
1423 for (i = 0; i < size; i++)
1424 if (u[i] > max)
1425 max = u[i];
1426 res = PyUnicode_New(size, max);
1427 if (!res)
1428 return NULL;
1429 if (max >= 256)
1430 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1431 else
1432 for (i = 0; i < size; i++)
1433 PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i];
1434 return res;
1435}
1436
Victor Stinnere57b1c02011-09-28 22:20:48 +02001437static PyObject*
1438_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001439{
1440 PyObject *res;
1441 Py_UCS4 max = 0;
1442 Py_ssize_t i;
1443 for (i = 0; i < size; i++)
1444 if (u[i] > max)
1445 max = u[i];
1446 res = PyUnicode_New(size, max);
1447 if (!res)
1448 return NULL;
1449 if (max >= 0x10000)
1450 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1451 else {
1452 int kind = PyUnicode_KIND(res);
1453 void *data = PyUnicode_DATA(res);
1454 for (i = 0; i < size; i++)
1455 PyUnicode_WRITE(kind, data, i, u[i]);
1456 }
1457 return res;
1458}
1459
1460PyObject*
1461PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1462{
1463 switch(kind) {
1464 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001465 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001466 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001467 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001468 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001469 return _PyUnicode_FromUCS4(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001470 }
Victor Stinner202b62b2011-10-01 23:48:37 +02001471 PyErr_SetString(PyExc_ValueError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001472 return NULL;
1473}
1474
Victor Stinner034f6cf2011-09-30 02:26:44 +02001475PyObject*
1476PyUnicode_Copy(PyObject *unicode)
1477{
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001478 Py_ssize_t size;
1479 PyObject *copy;
1480 void *data;
1481
Victor Stinner034f6cf2011-09-30 02:26:44 +02001482 if (!PyUnicode_Check(unicode)) {
1483 PyErr_BadInternalCall();
1484 return NULL;
1485 }
1486 if (PyUnicode_READY(unicode))
1487 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001488
1489 size = PyUnicode_GET_LENGTH(unicode);
1490 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1491 if (!copy)
1492 return NULL;
1493 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1494
1495 data = PyUnicode_DATA(unicode);
1496 switch (PyUnicode_KIND(unicode))
1497 {
1498 case PyUnicode_1BYTE_KIND:
1499 memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1500 break;
1501 case PyUnicode_2BYTE_KIND:
1502 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1503 break;
1504 case PyUnicode_4BYTE_KIND:
1505 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1506 break;
1507 default:
1508 assert(0);
1509 break;
1510 }
1511 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001512}
1513
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001514
Victor Stinnerbc603d12011-10-02 01:00:40 +02001515/* Widen Unicode objects to larger buffers. Don't write terminating null
1516 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001517
1518void*
1519_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1520{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001521 Py_ssize_t len;
1522 void *result;
1523 unsigned int skind;
1524
1525 if (PyUnicode_READY(s))
1526 return NULL;
1527
1528 len = PyUnicode_GET_LENGTH(s);
1529 skind = PyUnicode_KIND(s);
1530 if (skind >= kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001531 PyErr_SetString(PyExc_RuntimeError, "invalid widening attempt");
1532 return NULL;
1533 }
1534 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001535 case PyUnicode_2BYTE_KIND:
1536 result = PyMem_Malloc(len * sizeof(Py_UCS2));
1537 if (!result)
1538 return PyErr_NoMemory();
1539 assert(skind == PyUnicode_1BYTE_KIND);
1540 _PyUnicode_CONVERT_BYTES(
1541 Py_UCS1, Py_UCS2,
1542 PyUnicode_1BYTE_DATA(s),
1543 PyUnicode_1BYTE_DATA(s) + len,
1544 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001545 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001546 case PyUnicode_4BYTE_KIND:
1547 result = PyMem_Malloc(len * sizeof(Py_UCS4));
1548 if (!result)
1549 return PyErr_NoMemory();
1550 if (skind == PyUnicode_2BYTE_KIND) {
1551 _PyUnicode_CONVERT_BYTES(
1552 Py_UCS2, Py_UCS4,
1553 PyUnicode_2BYTE_DATA(s),
1554 PyUnicode_2BYTE_DATA(s) + len,
1555 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001556 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001557 else {
1558 assert(skind == PyUnicode_1BYTE_KIND);
1559 _PyUnicode_CONVERT_BYTES(
1560 Py_UCS1, Py_UCS4,
1561 PyUnicode_1BYTE_DATA(s),
1562 PyUnicode_1BYTE_DATA(s) + len,
1563 result);
1564 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001565 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001566 default:
1567 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001568 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001569 PyErr_SetString(PyExc_ValueError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001570 return NULL;
1571}
1572
1573static Py_UCS4*
1574as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1575 int copy_null)
1576{
1577 int kind;
1578 void *data;
1579 Py_ssize_t len, targetlen;
1580 if (PyUnicode_READY(string) == -1)
1581 return NULL;
1582 kind = PyUnicode_KIND(string);
1583 data = PyUnicode_DATA(string);
1584 len = PyUnicode_GET_LENGTH(string);
1585 targetlen = len;
1586 if (copy_null)
1587 targetlen++;
1588 if (!target) {
1589 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1590 PyErr_NoMemory();
1591 return NULL;
1592 }
1593 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1594 if (!target) {
1595 PyErr_NoMemory();
1596 return NULL;
1597 }
1598 }
1599 else {
1600 if (targetsize < targetlen) {
1601 PyErr_Format(PyExc_SystemError,
1602 "string is longer than the buffer");
1603 if (copy_null && 0 < targetsize)
1604 target[0] = 0;
1605 return NULL;
1606 }
1607 }
1608 if (kind != PyUnicode_4BYTE_KIND) {
1609 Py_ssize_t i;
1610 for (i = 0; i < len; i++)
1611 target[i] = PyUnicode_READ(kind, data, i);
1612 }
1613 else
1614 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1615 if (copy_null)
1616 target[len] = 0;
1617 return target;
1618}
1619
1620Py_UCS4*
1621PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1622 int copy_null)
1623{
1624 if (target == NULL || targetsize < 1) {
1625 PyErr_BadInternalCall();
1626 return NULL;
1627 }
1628 return as_ucs4(string, target, targetsize, copy_null);
1629}
1630
1631Py_UCS4*
1632PyUnicode_AsUCS4Copy(PyObject *string)
1633{
1634 return as_ucs4(string, NULL, 0, 1);
1635}
1636
1637#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00001638
Alexander Belopolsky40018472011-02-26 01:02:56 +00001639PyObject *
1640PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001641{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001642 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +00001643 if (size == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001644 return PyUnicode_New(0, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00001645 PyErr_BadInternalCall();
1646 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001647 }
1648
Martin v. Löwis790465f2008-04-05 20:41:37 +00001649 if (size == -1) {
1650 size = wcslen(w);
1651 }
1652
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001653 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001654}
1655
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001656#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00001657
Walter Dörwald346737f2007-05-31 10:44:43 +00001658static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001659makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
1660 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00001661{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001662 *fmt++ = '%';
1663 if (width) {
1664 if (zeropad)
1665 *fmt++ = '0';
1666 fmt += sprintf(fmt, "%d", width);
1667 }
1668 if (precision)
1669 fmt += sprintf(fmt, ".%d", precision);
1670 if (longflag)
1671 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001672 else if (longlongflag) {
1673 /* longlongflag should only ever be nonzero on machines with
1674 HAVE_LONG_LONG defined */
1675#ifdef HAVE_LONG_LONG
1676 char *f = PY_FORMAT_LONG_LONG;
1677 while (*f)
1678 *fmt++ = *f++;
1679#else
1680 /* we shouldn't ever get here */
1681 assert(0);
1682 *fmt++ = 'l';
1683#endif
1684 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001685 else if (size_tflag) {
1686 char *f = PY_FORMAT_SIZE_T;
1687 while (*f)
1688 *fmt++ = *f++;
1689 }
1690 *fmt++ = c;
1691 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00001692}
1693
/* helpers for PyUnicode_FromFormatV() */

/* True for the integer conversions ('d', 'u', 'i') that may follow an
   'l', 'll' or 'z' length modifier. */
static int
format_is_int_conversion(char c)
{
    return c == 'd' || c == 'u' || c == 'i';
}

/* Parse the part of a format specification that follows the '%' `f`
   points to: an optional width, an optional ".precision" and an optional
   length modifier ('l', 'll' when HAVE_LONG_LONG is defined, or 'z').
   A modifier is only recognised when it is directly followed by 'd', 'u'
   or 'i'.

   Each of the p_* output pointers may be NULL if the caller is not
   interested in that value.

   Returns a pointer to the character the parse stopped at, normally the
   conversion character.  For bogus formats that end right after the
   width/precision, or whose precision is directly followed by '%', the
   pointer is moved back by one character. */
static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    f++;                                /* step over the '%' */
    for (; Py_ISDIGIT((unsigned)*f); f++)
        width = (width*10) + *f - '0';
    if (*f == '.') {
        f++;
        for (; Py_ISDIGIT((unsigned)*f); f++)
            precision = (precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }

    /* Handle %ld, %lu, %lld, %llu and the size_t variants %zd, %zu. */
    switch (*f) {
    case 'l':
        if (format_is_int_conversion(f[1])) {
            longflag = 1;
            f += 1;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' && format_is_int_conversion(f[2])) {
            longlongflag = 1;
            f += 2;
        }
#endif
        break;
    case 'z':
        if (format_is_int_conversion(f[1])) {
            size_tflag = 1;
            f += 1;
        }
        break;
    default:
        break;
    }

    /* hand the results back to whoever asked for them */
    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;
    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
1758
/* Upper bound on the number of characters produced by %ld: 21 characters
   are enough for a 64-bit integer written in decimal, including an
   optional sign. */
#define MAX_LONG_CHARS 21
/* Upper bound on the number of characters produced by %lld.
   At most ceil(log10(256) * SIZEOF_LONG_LONG) digits are needed, plus one
   character for the sign; 53/22 is used as an upper bound for
   log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG * 53 - 1) / 22)
1766
Walter Dörwaldd2034312007-05-18 16:29:38 +00001767PyObject *
1768PyUnicode_FromFormatV(const char *format, va_list vargs)
1769{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001770 va_list count;
1771 Py_ssize_t callcount = 0;
1772 PyObject **callresults = NULL;
1773 PyObject **callresult = NULL;
1774 Py_ssize_t n = 0;
1775 int width = 0;
1776 int precision = 0;
1777 int zeropad;
1778 const char* f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001779 PyUnicodeObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001780 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001781 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001782 Py_UCS4 maxchar = 127; /* result is ASCII by default */
1783 Py_UCS4 argmaxchar;
1784 Py_ssize_t numbersize = 0;
1785 char *numberresults = NULL;
1786 char *numberresult = NULL;
1787 Py_ssize_t i;
1788 int kind;
1789 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001790
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001791 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001792 /* step 1: count the number of %S/%R/%A/%s format specifications
1793 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
1794 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001795 * result in an array)
1796 * also esimate a upper bound for all the number formats in the string,
1797 * numbers will be formated in step 3 and be keept in a '\0'-separated
1798 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00001799 for (f = format; *f; f++) {
1800 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001801 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001802 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
1803 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
1804 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
1805 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001806
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001807 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001808#ifdef HAVE_LONG_LONG
1809 if (longlongflag) {
1810 if (width < MAX_LONG_LONG_CHARS)
1811 width = MAX_LONG_LONG_CHARS;
1812 }
1813 else
1814#endif
1815 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
1816 including sign. Decimal takes the most space. This
1817 isn't enough for octal. If a width is specified we
1818 need more (which we allocate later). */
1819 if (width < MAX_LONG_CHARS)
1820 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001821
1822 /* account for the size + '\0' to separate numbers
1823 inside of the numberresults buffer */
1824 numbersize += (width + 1);
1825 }
1826 }
1827 else if ((unsigned char)*f > 127) {
1828 PyErr_Format(PyExc_ValueError,
1829 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
1830 "string, got a non-ASCII byte: 0x%02x",
1831 (unsigned char)*f);
1832 return NULL;
1833 }
1834 }
1835 /* step 2: allocate memory for the results of
1836 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
1837 if (callcount) {
1838 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
1839 if (!callresults) {
1840 PyErr_NoMemory();
1841 return NULL;
1842 }
1843 callresult = callresults;
1844 }
1845 /* step 2.5: allocate memory for the results of formating numbers */
1846 if (numbersize) {
1847 numberresults = PyObject_Malloc(numbersize);
1848 if (!numberresults) {
1849 PyErr_NoMemory();
1850 goto fail;
1851 }
1852 numberresult = numberresults;
1853 }
1854
1855 /* step 3: format numbers and figure out how large a buffer we need */
1856 for (f = format; *f; f++) {
1857 if (*f == '%') {
1858 const char* p;
1859 int longflag;
1860 int longlongflag;
1861 int size_tflag;
1862 int numprinted;
1863
1864 p = f;
1865 zeropad = (f[1] == '0');
1866 f = parse_format_flags(f, &width, &precision,
1867 &longflag, &longlongflag, &size_tflag);
1868 switch (*f) {
1869 case 'c':
1870 {
1871 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001872 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001873 n++;
1874 break;
1875 }
1876 case '%':
1877 n++;
1878 break;
1879 case 'i':
1880 case 'd':
1881 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1882 width, precision, *f);
1883 if (longflag)
1884 numprinted = sprintf(numberresult, fmt,
1885 va_arg(count, long));
1886#ifdef HAVE_LONG_LONG
1887 else if (longlongflag)
1888 numprinted = sprintf(numberresult, fmt,
1889 va_arg(count, PY_LONG_LONG));
1890#endif
1891 else if (size_tflag)
1892 numprinted = sprintf(numberresult, fmt,
1893 va_arg(count, Py_ssize_t));
1894 else
1895 numprinted = sprintf(numberresult, fmt,
1896 va_arg(count, int));
1897 n += numprinted;
1898 /* advance by +1 to skip over the '\0' */
1899 numberresult += (numprinted + 1);
1900 assert(*(numberresult - 1) == '\0');
1901 assert(*(numberresult - 2) != '\0');
1902 assert(numprinted >= 0);
1903 assert(numberresult <= numberresults + numbersize);
1904 break;
1905 case 'u':
1906 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1907 width, precision, 'u');
1908 if (longflag)
1909 numprinted = sprintf(numberresult, fmt,
1910 va_arg(count, unsigned long));
1911#ifdef HAVE_LONG_LONG
1912 else if (longlongflag)
1913 numprinted = sprintf(numberresult, fmt,
1914 va_arg(count, unsigned PY_LONG_LONG));
1915#endif
1916 else if (size_tflag)
1917 numprinted = sprintf(numberresult, fmt,
1918 va_arg(count, size_t));
1919 else
1920 numprinted = sprintf(numberresult, fmt,
1921 va_arg(count, unsigned int));
1922 n += numprinted;
1923 numberresult += (numprinted + 1);
1924 assert(*(numberresult - 1) == '\0');
1925 assert(*(numberresult - 2) != '\0');
1926 assert(numprinted >= 0);
1927 assert(numberresult <= numberresults + numbersize);
1928 break;
1929 case 'x':
1930 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
1931 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
1932 n += numprinted;
1933 numberresult += (numprinted + 1);
1934 assert(*(numberresult - 1) == '\0');
1935 assert(*(numberresult - 2) != '\0');
1936 assert(numprinted >= 0);
1937 assert(numberresult <= numberresults + numbersize);
1938 break;
1939 case 'p':
1940 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
1941 /* %p is ill-defined: ensure leading 0x. */
1942 if (numberresult[1] == 'X')
1943 numberresult[1] = 'x';
1944 else if (numberresult[1] != 'x') {
1945 memmove(numberresult + 2, numberresult,
1946 strlen(numberresult) + 1);
1947 numberresult[0] = '0';
1948 numberresult[1] = 'x';
1949 numprinted += 2;
1950 }
1951 n += numprinted;
1952 numberresult += (numprinted + 1);
1953 assert(*(numberresult - 1) == '\0');
1954 assert(*(numberresult - 2) != '\0');
1955 assert(numprinted >= 0);
1956 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001957 break;
1958 case 's':
1959 {
1960 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00001961 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001962 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
1963 if (!str)
1964 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001965 /* since PyUnicode_DecodeUTF8 returns already flexible
1966 unicode objects, there is no need to call ready on them */
1967 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001968 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001969 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001970 /* Remember the str and switch to the next slot */
1971 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001972 break;
1973 }
1974 case 'U':
1975 {
1976 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02001977 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001978 if (PyUnicode_READY(obj) == -1)
1979 goto fail;
1980 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001981 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001982 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001983 break;
1984 }
1985 case 'V':
1986 {
1987 PyObject *obj = va_arg(count, PyObject *);
1988 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001989 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001990 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02001991 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00001992 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001993 if (PyUnicode_READY(obj) == -1)
1994 goto fail;
1995 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001996 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001997 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001998 *callresult++ = NULL;
1999 }
2000 else {
2001 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2002 if (!str_obj)
2003 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002004 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002005 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002006 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002007 *callresult++ = str_obj;
2008 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002009 break;
2010 }
2011 case 'S':
2012 {
2013 PyObject *obj = va_arg(count, PyObject *);
2014 PyObject *str;
2015 assert(obj);
2016 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002017 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002018 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002019 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002020 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002021 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002022 /* Remember the str and switch to the next slot */
2023 *callresult++ = str;
2024 break;
2025 }
2026 case 'R':
2027 {
2028 PyObject *obj = va_arg(count, PyObject *);
2029 PyObject *repr;
2030 assert(obj);
2031 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002032 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002033 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002034 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002035 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002036 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002037 /* Remember the repr and switch to the next slot */
2038 *callresult++ = repr;
2039 break;
2040 }
2041 case 'A':
2042 {
2043 PyObject *obj = va_arg(count, PyObject *);
2044 PyObject *ascii;
2045 assert(obj);
2046 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002047 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002048 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002049 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002050 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002051 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002052 /* Remember the repr and switch to the next slot */
2053 *callresult++ = ascii;
2054 break;
2055 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002056 default:
2057 /* if we stumble upon an unknown
2058 formatting code, copy the rest of
2059 the format string to the output
2060 string. (we cannot just skip the
2061 code, since there's no way to know
2062 what's in the argument list) */
2063 n += strlen(p);
2064 goto expand;
2065 }
2066 } else
2067 n++;
2068 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002069 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002070 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002071 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002072 we don't have to resize the string.
2073 There can be no errors beyond this point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002074 string = (PyUnicodeObject *)PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002075 if (!string)
2076 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002077 kind = PyUnicode_KIND(string);
2078 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002079 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002080 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002081
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002082 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002083 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002084 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002085
2086 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002087 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2088 /* checking for == because the last argument could be a empty
2089 string, which causes i to point to end, the assert at the end of
2090 the loop */
2091 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002092
Benjamin Peterson14339b62009-01-31 16:36:08 +00002093 switch (*f) {
2094 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002095 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002096 const int ordinal = va_arg(vargs, int);
2097 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002098 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002099 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002100 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002101 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002102 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002103 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002104 case 'p':
2105 /* unused, since we already have the result */
2106 if (*f == 'p')
2107 (void) va_arg(vargs, void *);
2108 else
2109 (void) va_arg(vargs, int);
2110 /* extract the result from numberresults and append. */
2111 for (; *numberresult; ++i, ++numberresult)
2112 PyUnicode_WRITE(kind, data, i, *numberresult);
2113 /* skip over the separating '\0' */
2114 assert(*numberresult == '\0');
2115 numberresult++;
2116 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002117 break;
2118 case 's':
2119 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002120 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002121 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002122 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002123 size = PyUnicode_GET_LENGTH(*callresult);
2124 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002125 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2126 *callresult, 0,
2127 size) < 0)
2128 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002129 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002130 /* We're done with the unicode()/repr() => forget it */
2131 Py_DECREF(*callresult);
2132 /* switch to next unicode()/repr() result */
2133 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002134 break;
2135 }
2136 case 'U':
2137 {
2138 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002139 Py_ssize_t size;
2140 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2141 size = PyUnicode_GET_LENGTH(obj);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002142 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2143 obj, 0,
2144 size) < 0)
2145 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002146 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002147 break;
2148 }
2149 case 'V':
2150 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002151 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002152 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002153 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002154 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002155 size = PyUnicode_GET_LENGTH(obj);
2156 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002157 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2158 obj, 0,
2159 size) < 0)
2160 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002161 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002162 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002163 size = PyUnicode_GET_LENGTH(*callresult);
2164 assert(PyUnicode_KIND(*callresult) <=
2165 PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002166 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2167 *callresult,
2168 0, size) < 0)
2169 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002170 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002171 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002172 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002173 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002174 break;
2175 }
2176 case 'S':
2177 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002178 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002179 {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002180 /* unused, since we already have the result */
2181 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002182 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002183 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2184 *callresult, 0,
2185 PyUnicode_GET_LENGTH(*callresult)) < 0)
2186 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002187 i += PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002188 /* We're done with the unicode()/repr() => forget it */
2189 Py_DECREF(*callresult);
2190 /* switch to next unicode()/repr() result */
2191 ++callresult;
2192 break;
2193 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002194 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002195 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002196 break;
2197 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002198 for (; *p; ++p, ++i)
2199 PyUnicode_WRITE(kind, data, i, *p);
2200 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002201 goto end;
2202 }
Victor Stinner1205f272010-09-11 00:54:47 +00002203 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002204 else {
2205 assert(i < PyUnicode_GET_LENGTH(string));
2206 PyUnicode_WRITE(kind, data, i++, *f);
2207 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002208 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002209 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002210
Benjamin Peterson29060642009-01-31 22:14:21 +00002211 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002212 if (callresults)
2213 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002214 if (numberresults)
2215 PyObject_Free(numberresults);
2216 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00002217 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002218 if (callresults) {
2219 PyObject **callresult2 = callresults;
2220 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002221 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002222 ++callresult2;
2223 }
2224 PyObject_Free(callresults);
2225 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002226 if (numberresults)
2227 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002228 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002229}
2230
Walter Dörwaldd2034312007-05-18 16:29:38 +00002231PyObject *
2232PyUnicode_FromFormat(const char *format, ...)
2233{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002234 PyObject* ret;
2235 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002236
2237#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002238 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002239#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002240 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002241#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002242 ret = PyUnicode_FromFormatV(format, vargs);
2243 va_end(vargs);
2244 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002245}
2246
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002247#ifdef HAVE_WCHAR_H
2248
Victor Stinner5593d8a2010-10-02 11:11:27 +00002249/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2250 convert a Unicode object to a wide character string.
2251
Victor Stinnerd88d9832011-09-06 02:00:05 +02002252 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002253 character) required to convert the unicode object. Ignore size argument.
2254
Victor Stinnerd88d9832011-09-06 02:00:05 +02002255 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002256 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002257 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002258static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00002259unicode_aswidechar(PyUnicodeObject *unicode,
2260 wchar_t *w,
2261 Py_ssize_t size)
2262{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002263 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002264 const wchar_t *wstr;
2265
2266 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
2267 if (wstr == NULL)
2268 return -1;
2269
Victor Stinner5593d8a2010-10-02 11:11:27 +00002270 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002271 if (size > res)
2272 size = res + 1;
2273 else
2274 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002275 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002276 return res;
2277 }
2278 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002279 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002280}
2281
2282Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002283PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002284 wchar_t *w,
2285 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002286{
2287 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002288 PyErr_BadInternalCall();
2289 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002290 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002291 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002292}
2293
Victor Stinner137c34c2010-09-29 10:25:54 +00002294wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002295PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002296 Py_ssize_t *size)
2297{
2298 wchar_t* buffer;
2299 Py_ssize_t buflen;
2300
2301 if (unicode == NULL) {
2302 PyErr_BadInternalCall();
2303 return NULL;
2304 }
2305
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002306 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002307 if (buflen == -1)
2308 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002309 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002310 PyErr_NoMemory();
2311 return NULL;
2312 }
2313
Victor Stinner137c34c2010-09-29 10:25:54 +00002314 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2315 if (buffer == NULL) {
2316 PyErr_NoMemory();
2317 return NULL;
2318 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002319 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002320 if (buflen == -1)
2321 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002322 if (size != NULL)
2323 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002324 return buffer;
2325}
2326
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002327#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002328
Alexander Belopolsky40018472011-02-26 01:02:56 +00002329PyObject *
2330PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002331{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002332 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002333 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002334 PyErr_SetString(PyExc_ValueError,
2335 "chr() arg not in range(0x110000)");
2336 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002337 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002338
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002339 if (ordinal < 256)
2340 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002341
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002342 v = PyUnicode_New(1, ordinal);
2343 if (v == NULL)
2344 return NULL;
2345 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
2346 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002347}
2348
Alexander Belopolsky40018472011-02-26 01:02:56 +00002349PyObject *
2350PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002351{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002352 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002353 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002354 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002355 if (PyUnicode_READY(obj))
2356 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002357 Py_INCREF(obj);
2358 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002359 }
2360 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002361 /* For a Unicode subtype that's not a Unicode object,
2362 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002363 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002364 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002365 PyErr_Format(PyExc_TypeError,
2366 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002367 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002368 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002369}
2370
Alexander Belopolsky40018472011-02-26 01:02:56 +00002371PyObject *
2372PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002373 const char *encoding,
2374 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002375{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002376 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002377 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002378
Guido van Rossumd57fd912000-03-10 22:53:23 +00002379 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002380 PyErr_BadInternalCall();
2381 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002382 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002383
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002384 /* Decoding bytes objects is the most common case and should be fast */
2385 if (PyBytes_Check(obj)) {
2386 if (PyBytes_GET_SIZE(obj) == 0) {
2387 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002388 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002389 }
2390 else {
2391 v = PyUnicode_Decode(
2392 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2393 encoding, errors);
2394 }
2395 return v;
2396 }
2397
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002398 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002399 PyErr_SetString(PyExc_TypeError,
2400 "decoding str is not supported");
2401 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002402 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002403
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002404 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2405 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2406 PyErr_Format(PyExc_TypeError,
2407 "coercing to str: need bytes, bytearray "
2408 "or buffer-like object, %.80s found",
2409 Py_TYPE(obj)->tp_name);
2410 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002411 }
Tim Petersced69f82003-09-16 20:30:58 +00002412
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002413 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002414 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002415 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002416 }
Tim Petersced69f82003-09-16 20:30:58 +00002417 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002418 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002419
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002420 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002421 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002422}
2423
/* Copy encoding into lower, folding upper case letters to lower case and
   turning '_' into '-', so that spellings such as UTF_8 are recognised.

   lower_len is the size of the lower buffer, terminator included.
   Returns 1 on success (lower is then null terminated) and 0 when the name
   does not fit, i.e. when encoding is longer than lower_len-1. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    /* Number of characters that fit once the terminator is reserved. */
    size_t capacity = lower_len - 1;
    size_t i;

    for (i = 0; encoding[i] != '\0'; i++) {
        char c = encoding[i];

        if (i == capacity)
            return 0;

        if (Py_ISUPPER(c))
            lower[i] = Py_TOLOWER(c);
        else if (c == '_')
            lower[i] = '-';
        else
            lower[i] = c;
    }
    lower[i] = '\0';
    return 1;
}
2456
Alexander Belopolsky40018472011-02-26 01:02:56 +00002457PyObject *
2458PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002459 Py_ssize_t size,
2460 const char *encoding,
2461 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002462{
2463 PyObject *buffer = NULL, *unicode;
2464 Py_buffer info;
2465 char lower[11]; /* Enough for any encoding shortcut */
2466
2467 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002468 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00002469
2470 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002471 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002472 if ((strcmp(lower, "utf-8") == 0) ||
2473 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002474 return PyUnicode_DecodeUTF8(s, size, errors);
2475 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002476 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002477 (strcmp(lower, "iso-8859-1") == 0))
2478 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002479#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002480 else if (strcmp(lower, "mbcs") == 0)
2481 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002482#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002483 else if (strcmp(lower, "ascii") == 0)
2484 return PyUnicode_DecodeASCII(s, size, errors);
2485 else if (strcmp(lower, "utf-16") == 0)
2486 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2487 else if (strcmp(lower, "utf-32") == 0)
2488 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2489 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002490
2491 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002492 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002493 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002494 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002495 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002496 if (buffer == NULL)
2497 goto onError;
2498 unicode = PyCodec_Decode(buffer, encoding, errors);
2499 if (unicode == NULL)
2500 goto onError;
2501 if (!PyUnicode_Check(unicode)) {
2502 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002503 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002504 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002505 Py_DECREF(unicode);
2506 goto onError;
2507 }
2508 Py_DECREF(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002509 if (PyUnicode_READY(unicode)) {
2510 Py_DECREF(unicode);
2511 return NULL;
2512 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002513 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002514
Benjamin Peterson29060642009-01-31 22:14:21 +00002515 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002516 Py_XDECREF(buffer);
2517 return NULL;
2518}
2519
Alexander Belopolsky40018472011-02-26 01:02:56 +00002520PyObject *
2521PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002522 const char *encoding,
2523 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002524{
2525 PyObject *v;
2526
2527 if (!PyUnicode_Check(unicode)) {
2528 PyErr_BadArgument();
2529 goto onError;
2530 }
2531
2532 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002533 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002534
2535 /* Decode via the codec registry */
2536 v = PyCodec_Decode(unicode, encoding, errors);
2537 if (v == NULL)
2538 goto onError;
2539 return v;
2540
Benjamin Peterson29060642009-01-31 22:14:21 +00002541 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002542 return NULL;
2543}
2544
Alexander Belopolsky40018472011-02-26 01:02:56 +00002545PyObject *
2546PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002547 const char *encoding,
2548 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002549{
2550 PyObject *v;
2551
2552 if (!PyUnicode_Check(unicode)) {
2553 PyErr_BadArgument();
2554 goto onError;
2555 }
2556
2557 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002558 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002559
2560 /* Decode via the codec registry */
2561 v = PyCodec_Decode(unicode, encoding, errors);
2562 if (v == NULL)
2563 goto onError;
2564 if (!PyUnicode_Check(v)) {
2565 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002566 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002567 Py_TYPE(v)->tp_name);
2568 Py_DECREF(v);
2569 goto onError;
2570 }
2571 return v;
2572
Benjamin Peterson29060642009-01-31 22:14:21 +00002573 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002574 return NULL;
2575}
2576
Alexander Belopolsky40018472011-02-26 01:02:56 +00002577PyObject *
2578PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002579 Py_ssize_t size,
2580 const char *encoding,
2581 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002582{
2583 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002584
Guido van Rossumd57fd912000-03-10 22:53:23 +00002585 unicode = PyUnicode_FromUnicode(s, size);
2586 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002587 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002588 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2589 Py_DECREF(unicode);
2590 return v;
2591}
2592
Alexander Belopolsky40018472011-02-26 01:02:56 +00002593PyObject *
2594PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002595 const char *encoding,
2596 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002597{
2598 PyObject *v;
2599
2600 if (!PyUnicode_Check(unicode)) {
2601 PyErr_BadArgument();
2602 goto onError;
2603 }
2604
2605 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002606 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002607
2608 /* Encode via the codec registry */
2609 v = PyCodec_Encode(unicode, encoding, errors);
2610 if (v == NULL)
2611 goto onError;
2612 return v;
2613
Benjamin Peterson29060642009-01-31 22:14:21 +00002614 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002615 return NULL;
2616}
2617
Victor Stinnerad158722010-10-27 00:25:46 +00002618PyObject *
2619PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00002620{
Victor Stinner99b95382011-07-04 14:23:54 +02002621#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002622 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2623 PyUnicode_GET_SIZE(unicode),
2624 NULL);
2625#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002626 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00002627#else
Victor Stinner793b5312011-04-27 00:24:21 +02002628 PyInterpreterState *interp = PyThreadState_GET()->interp;
2629 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2630 cannot use it to encode and decode filenames before it is loaded. Load
2631 the Python codec requires to encode at least its own filename. Use the C
2632 version of the locale codec until the codec registry is initialized and
2633 the Python codec is loaded.
2634
2635 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2636 cannot only rely on it: check also interp->fscodec_initialized for
2637 subinterpreters. */
2638 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00002639 return PyUnicode_AsEncodedString(unicode,
2640 Py_FileSystemDefaultEncoding,
2641 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00002642 }
2643 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002644 /* locale encoding with surrogateescape */
2645 wchar_t *wchar;
2646 char *bytes;
2647 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00002648 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002649
2650 wchar = PyUnicode_AsWideCharString(unicode, NULL);
2651 if (wchar == NULL)
2652 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002653 bytes = _Py_wchar2char(wchar, &error_pos);
2654 if (bytes == NULL) {
2655 if (error_pos != (size_t)-1) {
2656 char *errmsg = strerror(errno);
2657 PyObject *exc = NULL;
2658 if (errmsg == NULL)
2659 errmsg = "Py_wchar2char() failed";
2660 raise_encode_exception(&exc,
2661 "filesystemencoding",
2662 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
2663 error_pos, error_pos+1,
2664 errmsg);
2665 Py_XDECREF(exc);
2666 }
2667 else
2668 PyErr_NoMemory();
2669 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002670 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002671 }
2672 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002673
2674 bytes_obj = PyBytes_FromString(bytes);
2675 PyMem_Free(bytes);
2676 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00002677 }
Victor Stinnerad158722010-10-27 00:25:46 +00002678#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00002679}
2680
Alexander Belopolsky40018472011-02-26 01:02:56 +00002681PyObject *
2682PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002683 const char *encoding,
2684 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002685{
2686 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00002687 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00002688
Guido van Rossumd57fd912000-03-10 22:53:23 +00002689 if (!PyUnicode_Check(unicode)) {
2690 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002691 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002692 }
Fred Drakee4315f52000-05-09 19:53:39 +00002693
Victor Stinner2f283c22011-03-02 01:21:46 +00002694 if (encoding == NULL) {
2695 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002696 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002697 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002698 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner2f283c22011-03-02 01:21:46 +00002699 }
Fred Drakee4315f52000-05-09 19:53:39 +00002700
2701 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002702 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002703 if ((strcmp(lower, "utf-8") == 0) ||
2704 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00002705 {
Victor Stinner2f283c22011-03-02 01:21:46 +00002706 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002707 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002708 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002709 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00002710 }
Victor Stinner37296e82010-06-10 13:36:23 +00002711 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002712 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002713 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002714 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002715#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002716 else if (strcmp(lower, "mbcs") == 0)
2717 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2718 PyUnicode_GET_SIZE(unicode),
2719 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002720#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002721 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002722 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00002723 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002724
2725 /* Encode via the codec registry */
2726 v = PyCodec_Encode(unicode, encoding, errors);
2727 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002728 return NULL;
2729
2730 /* The normal path */
2731 if (PyBytes_Check(v))
2732 return v;
2733
2734 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002735 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002736 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002737 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002738
2739 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
2740 "encoder %s returned bytearray instead of bytes",
2741 encoding);
2742 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002743 Py_DECREF(v);
2744 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002745 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002746
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002747 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
2748 Py_DECREF(v);
2749 return b;
2750 }
2751
2752 PyErr_Format(PyExc_TypeError,
2753 "encoder did not return a bytes object (type=%.400s)",
2754 Py_TYPE(v)->tp_name);
2755 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002756 return NULL;
2757}
2758
Alexander Belopolsky40018472011-02-26 01:02:56 +00002759PyObject *
2760PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002761 const char *encoding,
2762 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002763{
2764 PyObject *v;
2765
2766 if (!PyUnicode_Check(unicode)) {
2767 PyErr_BadArgument();
2768 goto onError;
2769 }
2770
2771 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002772 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002773
2774 /* Encode via the codec registry */
2775 v = PyCodec_Encode(unicode, encoding, errors);
2776 if (v == NULL)
2777 goto onError;
2778 if (!PyUnicode_Check(v)) {
2779 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002780 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002781 Py_TYPE(v)->tp_name);
2782 Py_DECREF(v);
2783 goto onError;
2784 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002785 return v;
Tim Petersced69f82003-09-16 20:30:58 +00002786
Benjamin Peterson29060642009-01-31 22:14:21 +00002787 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002788 return NULL;
2789}
2790
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002791PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00002792PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002793 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00002794 return PyUnicode_DecodeFSDefaultAndSize(s, size);
2795}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002796
Christian Heimes5894ba72007-11-04 11:43:14 +00002797PyObject*
2798PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
2799{
Victor Stinner99b95382011-07-04 14:23:54 +02002800#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002801 return PyUnicode_DecodeMBCS(s, size, NULL);
2802#elif defined(__APPLE__)
2803 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
2804#else
Victor Stinner793b5312011-04-27 00:24:21 +02002805 PyInterpreterState *interp = PyThreadState_GET()->interp;
2806 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2807 cannot use it to encode and decode filenames before it is loaded. Load
2808 the Python codec requires to encode at least its own filename. Use the C
2809 version of the locale codec until the codec registry is initialized and
2810 the Python codec is loaded.
2811
2812 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2813 cannot only rely on it: check also interp->fscodec_initialized for
2814 subinterpreters. */
2815 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002816 return PyUnicode_Decode(s, size,
2817 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00002818 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002819 }
2820 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002821 /* locale encoding with surrogateescape */
2822 wchar_t *wchar;
2823 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00002824 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002825
2826 if (s[size] != '\0' || size != strlen(s)) {
2827 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2828 return NULL;
2829 }
2830
Victor Stinner168e1172010-10-16 23:16:16 +00002831 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002832 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00002833 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002834
Victor Stinner168e1172010-10-16 23:16:16 +00002835 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002836 PyMem_Free(wchar);
2837 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002838 }
Victor Stinnerad158722010-10-27 00:25:46 +00002839#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002840}
2841
Martin v. Löwis011e8422009-05-05 04:43:17 +00002842
2843int
2844PyUnicode_FSConverter(PyObject* arg, void* addr)
2845{
2846 PyObject *output = NULL;
2847 Py_ssize_t size;
2848 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002849 if (arg == NULL) {
2850 Py_DECREF(*(PyObject**)addr);
2851 return 1;
2852 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00002853 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00002854 output = arg;
2855 Py_INCREF(output);
2856 }
2857 else {
2858 arg = PyUnicode_FromObject(arg);
2859 if (!arg)
2860 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00002861 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002862 Py_DECREF(arg);
2863 if (!output)
2864 return 0;
2865 if (!PyBytes_Check(output)) {
2866 Py_DECREF(output);
2867 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
2868 return 0;
2869 }
2870 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00002871 size = PyBytes_GET_SIZE(output);
2872 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002873 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05002874 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00002875 Py_DECREF(output);
2876 return 0;
2877 }
2878 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002879 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00002880}
2881
2882
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002883int
2884PyUnicode_FSDecoder(PyObject* arg, void* addr)
2885{
2886 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002887 if (arg == NULL) {
2888 Py_DECREF(*(PyObject**)addr);
2889 return 1;
2890 }
2891 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002892 if (PyUnicode_READY(arg))
2893 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002894 output = arg;
2895 Py_INCREF(output);
2896 }
2897 else {
2898 arg = PyBytes_FromObject(arg);
2899 if (!arg)
2900 return 0;
2901 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
2902 PyBytes_GET_SIZE(arg));
2903 Py_DECREF(arg);
2904 if (!output)
2905 return 0;
2906 if (!PyUnicode_Check(output)) {
2907 Py_DECREF(output);
2908 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
2909 return 0;
2910 }
2911 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002912 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
2913 PyUnicode_GET_LENGTH(output), 0, 1)) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002914 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2915 Py_DECREF(output);
2916 return 0;
2917 }
2918 *(PyObject**)addr = output;
2919 return Py_CLEANUP_SUPPORTED;
2920}
2921
2922
Martin v. Löwis5b222132007-06-10 09:51:05 +00002923char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002924PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002925{
Christian Heimesf3863112007-11-22 07:46:41 +00002926 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002927 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
2928
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00002929 if (!PyUnicode_Check(unicode)) {
2930 PyErr_BadArgument();
2931 return NULL;
2932 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002933 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002934 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002935
Victor Stinnere90fe6a2011-10-01 16:48:13 +02002936 if (PyUnicode_UTF8(unicode) == NULL) {
2937 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002938 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
2939 if (bytes == NULL)
2940 return NULL;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02002941 _PyUnicode_UTF8(u) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
2942 if (_PyUnicode_UTF8(u) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002943 Py_DECREF(bytes);
2944 return NULL;
2945 }
Victor Stinnere90fe6a2011-10-01 16:48:13 +02002946 _PyUnicode_UTF8_LENGTH(u) = PyBytes_GET_SIZE(bytes);
2947 Py_MEMCPY(_PyUnicode_UTF8(u), PyBytes_AS_STRING(bytes), _PyUnicode_UTF8_LENGTH(u) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002948 Py_DECREF(bytes);
2949 }
2950
2951 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02002952 *psize = PyUnicode_UTF8_LENGTH(unicode);
2953 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00002954}
2955
2956char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002957PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00002958{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002959 return PyUnicode_AsUTF8AndSize(unicode, NULL);
2960}
2961
#if defined(Py_DEBUG)
/* Debug-build counter, incremented by PyUnicode_AsUnicodeAndSize() each time
   it has to build the wchar_t representation of an object. */
int unicode_as_unicode_calls = 0;
#endif
2965
2966
2967Py_UNICODE *
2968PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
2969{
2970 PyUnicodeObject *u;
2971 const unsigned char *one_byte;
2972#if SIZEOF_WCHAR_T == 4
2973 const Py_UCS2 *two_bytes;
2974#else
2975 const Py_UCS4 *four_bytes;
2976 const Py_UCS4 *ucs4_end;
2977 Py_ssize_t num_surrogates;
2978#endif
2979 wchar_t *w;
2980 wchar_t *wchar_end;
2981
2982 if (!PyUnicode_Check(unicode)) {
2983 PyErr_BadArgument();
2984 return NULL;
2985 }
2986 u = (PyUnicodeObject*)unicode;
2987 if (_PyUnicode_WSTR(u) == NULL) {
2988 /* Non-ASCII compact unicode object */
2989 assert(_PyUnicode_KIND(u) != 0);
2990 assert(PyUnicode_IS_READY(u));
2991
2992#ifdef Py_DEBUG
2993 ++unicode_as_unicode_calls;
2994#endif
2995
2996 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
2997#if SIZEOF_WCHAR_T == 2
2998 four_bytes = PyUnicode_4BYTE_DATA(u);
2999 ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
3000 num_surrogates = 0;
3001
3002 for (; four_bytes < ucs4_end; ++four_bytes) {
3003 if (*four_bytes > 0xFFFF)
3004 ++num_surrogates;
3005 }
3006
3007 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
3008 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
3009 if (!_PyUnicode_WSTR(u)) {
3010 PyErr_NoMemory();
3011 return NULL;
3012 }
3013 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;
3014
3015 w = _PyUnicode_WSTR(u);
3016 wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
3017 four_bytes = PyUnicode_4BYTE_DATA(u);
3018 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3019 if (*four_bytes > 0xFFFF) {
3020 /* encode surrogate pair in this case */
3021 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
3022 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
3023 }
3024 else
3025 *w = *four_bytes;
3026
3027 if (w > wchar_end) {
3028 assert(0 && "Miscalculated string end");
3029 }
3030 }
3031 *w = 0;
3032#else
3033 /* sizeof(wchar_t) == 4 */
3034 Py_FatalError("Impossible unicode object state, wstr and str "
3035 "should share memory already.");
3036 return NULL;
3037#endif
3038 }
3039 else {
3040 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3041 (_PyUnicode_LENGTH(u) + 1));
3042 if (!_PyUnicode_WSTR(u)) {
3043 PyErr_NoMemory();
3044 return NULL;
3045 }
3046 if (!PyUnicode_IS_COMPACT_ASCII(u))
3047 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
3048 w = _PyUnicode_WSTR(u);
3049 wchar_end = w + _PyUnicode_LENGTH(u);
3050
3051 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
3052 one_byte = PyUnicode_1BYTE_DATA(u);
3053 for (; w < wchar_end; ++one_byte, ++w)
3054 *w = *one_byte;
3055 /* null-terminate the wstr */
3056 *w = 0;
3057 }
3058 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
3059#if SIZEOF_WCHAR_T == 4
3060 two_bytes = PyUnicode_2BYTE_DATA(u);
3061 for (; w < wchar_end; ++two_bytes, ++w)
3062 *w = *two_bytes;
3063 /* null-terminate the wstr */
3064 *w = 0;
3065#else
3066 /* sizeof(wchar_t) == 2 */
3067 PyObject_FREE(_PyUnicode_WSTR(u));
3068 _PyUnicode_WSTR(u) = NULL;
3069 Py_FatalError("Impossible unicode object state, wstr "
3070 "and str should share memory already.");
3071 return NULL;
3072#endif
3073 }
3074 else {
3075 assert(0 && "This should never happen.");
3076 }
3077 }
3078 }
3079 if (size != NULL)
3080 *size = PyUnicode_WSTR_LENGTH(u);
3081 return _PyUnicode_WSTR(u);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003082}
3083
Alexander Belopolsky40018472011-02-26 01:02:56 +00003084Py_UNICODE *
3085PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003086{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003087 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003088}
3089
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003090
Alexander Belopolsky40018472011-02-26 01:02:56 +00003091Py_ssize_t
3092PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003093{
3094 if (!PyUnicode_Check(unicode)) {
3095 PyErr_BadArgument();
3096 goto onError;
3097 }
3098 return PyUnicode_GET_SIZE(unicode);
3099
Benjamin Peterson29060642009-01-31 22:14:21 +00003100 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003101 return -1;
3102}
3103
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003104Py_ssize_t
3105PyUnicode_GetLength(PyObject *unicode)
3106{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003107 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003108 PyErr_BadArgument();
3109 return -1;
3110 }
3111
3112 return PyUnicode_GET_LENGTH(unicode);
3113}
3114
3115Py_UCS4
3116PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3117{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003118 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3119 PyErr_BadArgument();
3120 return (Py_UCS4)-1;
3121 }
3122 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3123 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003124 return (Py_UCS4)-1;
3125 }
3126 return PyUnicode_READ_CHAR(unicode, index);
3127}
3128
3129int
3130PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3131{
3132 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003133 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003134 return -1;
3135 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02003136 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3137 PyErr_SetString(PyExc_IndexError, "string index out of range");
3138 return -1;
3139 }
3140 if (_PyUnicode_Dirty(unicode))
3141 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003142 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3143 index, ch);
3144 return 0;
3145}
3146
/* Return the name of the default encoding.  This is the constant string
   "utf-8"; the caller must not modify or free it. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    return "utf-8";
}
3152
Victor Stinner554f3f02010-06-16 23:33:54 +00003153/* create or adjust a UnicodeDecodeError */
3154static void
3155make_decode_exception(PyObject **exceptionObject,
3156 const char *encoding,
3157 const char *input, Py_ssize_t length,
3158 Py_ssize_t startpos, Py_ssize_t endpos,
3159 const char *reason)
3160{
3161 if (*exceptionObject == NULL) {
3162 *exceptionObject = PyUnicodeDecodeError_Create(
3163 encoding, input, length, startpos, endpos, reason);
3164 }
3165 else {
3166 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3167 goto onError;
3168 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3169 goto onError;
3170 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3171 goto onError;
3172 }
3173 return;
3174
3175onError:
3176 Py_DECREF(*exceptionObject);
3177 *exceptionObject = NULL;
3178}
3179
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003180/* error handling callback helper:
3181 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003182 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003183 and adjust various state variables.
3184 return 0 on success, -1 on error
3185*/
3186
Alexander Belopolsky40018472011-02-26 01:02:56 +00003187static int
3188unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003189 const char *encoding, const char *reason,
3190 const char **input, const char **inend, Py_ssize_t *startinpos,
3191 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
3192 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003193{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003194 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003195
3196 PyObject *restuple = NULL;
3197 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003198 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003199 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003200 Py_ssize_t requiredsize;
3201 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003202 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003203 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003204 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003205 int res = -1;
3206
3207 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003208 *errorHandler = PyCodec_LookupError(errors);
3209 if (*errorHandler == NULL)
3210 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003211 }
3212
Victor Stinner554f3f02010-06-16 23:33:54 +00003213 make_decode_exception(exceptionObject,
3214 encoding,
3215 *input, *inend - *input,
3216 *startinpos, *endinpos,
3217 reason);
3218 if (*exceptionObject == NULL)
3219 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003220
3221 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3222 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003223 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003224 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003225 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003226 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003227 }
3228 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003229 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003230
3231 /* Copy back the bytes variables, which might have been modified by the
3232 callback */
3233 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3234 if (!inputobj)
3235 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003236 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003237 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003238 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003239 *input = PyBytes_AS_STRING(inputobj);
3240 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003241 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003242 /* we can DECREF safely, as the exception has another reference,
3243 so the object won't go away. */
3244 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003245
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003246 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003247 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003248 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003249 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3250 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003251 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003252
3253 /* need more space? (at least enough for what we
3254 have+the replacement+the rest of the string (starting
3255 at the new input position), so we won't have to check space
3256 when there are no errors in the rest of the string) */
3257 repptr = PyUnicode_AS_UNICODE(repunicode);
3258 repsize = PyUnicode_GET_SIZE(repunicode);
3259 requiredsize = *outpos + repsize + insize-newpos;
3260 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003261 if (requiredsize<2*outsize)
3262 requiredsize = 2*outsize;
Victor Stinnerfe226c02011-10-03 03:52:20 +02003263 if (PyUnicode_Resize((PyObject**)output, requiredsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003264 goto onError;
3265 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003266 }
3267 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003268 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003269 Py_UNICODE_COPY(*outptr, repptr, repsize);
3270 *outptr += repsize;
3271 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003272
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003273 /* we made it! */
3274 res = 0;
3275
Benjamin Peterson29060642009-01-31 22:14:21 +00003276 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003277 Py_XDECREF(restuple);
3278 return res;
3279}
3280
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64.  Each of them may evaluate its
   argument more than once, so do not pass expressions with side effects. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')
3315
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c is evaluated several times.  directO and directWS are parenthesized
 * so that arbitrary boolean expressions can be passed for them. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003361
Alexander Belopolsky40018472011-02-26 01:02:56 +00003362PyObject *
3363PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003364 Py_ssize_t size,
3365 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003366{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003367 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3368}
3369
Antoine Pitrou244651a2009-05-04 18:56:13 +00003370/* The decoder. The only state we preserve is our read position,
3371 * i.e. how many characters we have consumed. So if we end in the
3372 * middle of a shift sequence we have to back off the read position
3373 * and the output to the beginning of the sequence, otherwise we lose
3374 * all the shift state (seen bits, number of bits seen, high
3375 * surrogate). */
3376
Alexander Belopolsky40018472011-02-26 01:02:56 +00003377PyObject *
3378PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003379 Py_ssize_t size,
3380 const char *errors,
3381 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003382{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003383 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003384 Py_ssize_t startinpos;
3385 Py_ssize_t endinpos;
3386 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003387 const char *e;
3388 PyUnicodeObject *unicode;
3389 Py_UNICODE *p;
3390 const char *errmsg = "";
3391 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003392 Py_UNICODE *shiftOutStart;
3393 unsigned int base64bits = 0;
3394 unsigned long base64buffer = 0;
3395 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003396 PyObject *errorHandler = NULL;
3397 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003398
3399 unicode = _PyUnicode_New(size);
3400 if (!unicode)
3401 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003402 if (size == 0) {
3403 if (consumed)
3404 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003405 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003406 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003407
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003408 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003409 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003410 e = s + size;
3411
3412 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003413 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003414 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003415 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003416
Antoine Pitrou244651a2009-05-04 18:56:13 +00003417 if (inShift) { /* in a base-64 section */
3418 if (IS_BASE64(ch)) { /* consume a base-64 character */
3419 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3420 base64bits += 6;
3421 s++;
3422 if (base64bits >= 16) {
3423 /* we have enough bits for a UTF-16 value */
3424 Py_UNICODE outCh = (Py_UNICODE)
3425 (base64buffer >> (base64bits-16));
3426 base64bits -= 16;
3427 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3428 if (surrogate) {
3429 /* expecting a second surrogate */
3430 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3431#ifdef Py_UNICODE_WIDE
3432 *p++ = (((surrogate & 0x3FF)<<10)
3433 | (outCh & 0x3FF)) + 0x10000;
3434#else
3435 *p++ = surrogate;
3436 *p++ = outCh;
3437#endif
3438 surrogate = 0;
3439 }
3440 else {
3441 surrogate = 0;
3442 errmsg = "second surrogate missing";
3443 goto utf7Error;
3444 }
3445 }
3446 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3447 /* first surrogate */
3448 surrogate = outCh;
3449 }
3450 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3451 errmsg = "unexpected second surrogate";
3452 goto utf7Error;
3453 }
3454 else {
3455 *p++ = outCh;
3456 }
3457 }
3458 }
3459 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003460 inShift = 0;
3461 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003462 if (surrogate) {
3463 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003464 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003465 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003466 if (base64bits > 0) { /* left-over bits */
3467 if (base64bits >= 6) {
3468 /* We've seen at least one base-64 character */
3469 errmsg = "partial character in shift sequence";
3470 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003471 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003472 else {
3473 /* Some bits remain; they should be zero */
3474 if (base64buffer != 0) {
3475 errmsg = "non-zero padding bits in shift sequence";
3476 goto utf7Error;
3477 }
3478 }
3479 }
3480 if (ch != '-') {
3481 /* '-' is absorbed; other terminating
3482 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003483 *p++ = ch;
3484 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003485 }
3486 }
3487 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003488 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003489 s++; /* consume '+' */
3490 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003491 s++;
3492 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003493 }
3494 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003495 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003496 shiftOutStart = p;
3497 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003498 }
3499 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003500 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003501 *p++ = ch;
3502 s++;
3503 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003504 else {
3505 startinpos = s-starts;
3506 s++;
3507 errmsg = "unexpected special character";
3508 goto utf7Error;
3509 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003510 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003511utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003512 outpos = p-PyUnicode_AS_UNICODE(unicode);
3513 endinpos = s-starts;
3514 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003515 errors, &errorHandler,
3516 "utf7", errmsg,
3517 &starts, &e, &startinpos, &endinpos, &exc, &s,
3518 &unicode, &outpos, &p))
3519 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003520 }
3521
Antoine Pitrou244651a2009-05-04 18:56:13 +00003522 /* end of string */
3523
3524 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3525 /* if we're in an inconsistent state, that's an error */
3526 if (surrogate ||
3527 (base64bits >= 6) ||
3528 (base64bits > 0 && base64buffer != 0)) {
3529 outpos = p-PyUnicode_AS_UNICODE(unicode);
3530 endinpos = size;
3531 if (unicode_decode_call_errorhandler(
3532 errors, &errorHandler,
3533 "utf7", "unterminated shift sequence",
3534 &starts, &e, &startinpos, &endinpos, &exc, &s,
3535 &unicode, &outpos, &p))
3536 goto onError;
3537 if (s < e)
3538 goto restart;
3539 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003540 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003541
3542 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003543 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003544 if (inShift) {
3545 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003546 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003547 }
3548 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003549 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003550 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003551 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003552
Victor Stinnerfe226c02011-10-03 03:52:20 +02003553 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003554 goto onError;
3555
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003556 Py_XDECREF(errorHandler);
3557 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003558 if (PyUnicode_READY(unicode) == -1) {
3559 Py_DECREF(unicode);
3560 return NULL;
3561 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003562 return (PyObject *)unicode;
3563
Benjamin Peterson29060642009-01-31 22:14:21 +00003564 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003565 Py_XDECREF(errorHandler);
3566 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003567 Py_DECREF(unicode);
3568 return NULL;
3569}
3570
3571
Alexander Belopolsky40018472011-02-26 01:02:56 +00003572PyObject *
3573PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003574 Py_ssize_t size,
3575 int base64SetO,
3576 int base64WhiteSpace,
3577 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003578{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003579 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003580 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003581 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003582 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003583 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003584 unsigned int base64bits = 0;
3585 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003586 char * out;
3587 char * start;
3588
3589 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003590 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003591
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003592 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003593 return PyErr_NoMemory();
3594
Antoine Pitrou244651a2009-05-04 18:56:13 +00003595 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003596 if (v == NULL)
3597 return NULL;
3598
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003599 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003600 for (;i < size; ++i) {
3601 Py_UNICODE ch = s[i];
3602
Antoine Pitrou244651a2009-05-04 18:56:13 +00003603 if (inShift) {
3604 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3605 /* shifting out */
3606 if (base64bits) { /* output remaining bits */
3607 *out++ = TO_BASE64(base64buffer << (6-base64bits));
3608 base64buffer = 0;
3609 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003610 }
3611 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003612 /* Characters not in the BASE64 set implicitly unshift the sequence
3613 so no '-' is required, except if the character is itself a '-' */
3614 if (IS_BASE64(ch) || ch == '-') {
3615 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003616 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003617 *out++ = (char) ch;
3618 }
3619 else {
3620 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00003621 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003622 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003623 else { /* not in a shift sequence */
3624 if (ch == '+') {
3625 *out++ = '+';
3626 *out++ = '-';
3627 }
3628 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3629 *out++ = (char) ch;
3630 }
3631 else {
3632 *out++ = '+';
3633 inShift = 1;
3634 goto encode_char;
3635 }
3636 }
3637 continue;
3638encode_char:
3639#ifdef Py_UNICODE_WIDE
3640 if (ch >= 0x10000) {
3641 /* code first surrogate */
3642 base64bits += 16;
3643 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
3644 while (base64bits >= 6) {
3645 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3646 base64bits -= 6;
3647 }
3648 /* prepare second surrogate */
3649 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
3650 }
3651#endif
3652 base64bits += 16;
3653 base64buffer = (base64buffer << 16) | ch;
3654 while (base64bits >= 6) {
3655 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3656 base64bits -= 6;
3657 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00003658 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003659 if (base64bits)
3660 *out++= TO_BASE64(base64buffer << (6-base64bits) );
3661 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003662 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003663 if (_PyBytes_Resize(&v, out - start) < 0)
3664 return NULL;
3665 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003666}
3667
/* The UTF-7 helper macros are private to the codec above. */
#undef IS_BASE64
#undef FROM_BASE64
#undef TO_BASE64
#undef DECODE_DIRECT
#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003673
/* --- UTF-8 Codec -------------------------------------------------------- */

static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
       illegal prefix.  See RFC 3629 for details */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
3697
Alexander Belopolsky40018472011-02-26 01:02:56 +00003698PyObject *
3699PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003700 Py_ssize_t size,
3701 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003702{
Walter Dörwald69652032004-09-07 20:24:22 +00003703 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3704}
3705
Antoine Pitrouab868312009-01-10 15:40:25 +00003706/* Mask to check or force alignment of a pointer to C 'long' boundaries */
3707#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
3708
3709/* Mask to quickly check whether a C 'long' contains a
3710 non-ASCII, UTF8-encoded char. */
3711#if (SIZEOF_LONG == 8)
3712# define ASCII_CHAR_MASK 0x8080808080808080L
3713#elif (SIZEOF_LONG == 4)
3714# define ASCII_CHAR_MASK 0x80808080L
3715#else
3716# error C 'long' size should be either 4 or 8!
3717#endif
3718
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003719/* Scans a UTF-8 string and returns the maximum character to be expected,
3720 the size of the decoded unicode string and if any major errors were
3721 encountered.
3722
3723 This function does check basic UTF-8 sanity, it does however NOT CHECK
3724 if the string contains surrogates, and if all continuation bytes are
3725 within the correct ranges, these checks are performed in
3726 PyUnicode_DecodeUTF8Stateful.
3727
3728 If it sets has_errors to 1, it means the value of unicode_size and max_char
3729 will be bogus and you should not rely on useful information in them.
3730 */
3731static Py_UCS4
3732utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
3733 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
3734 int *has_errors)
3735{
3736 Py_ssize_t n;
3737 Py_ssize_t char_count = 0;
3738 Py_UCS4 max_char = 127, new_max;
3739 Py_UCS4 upper_bound;
3740 const unsigned char *p = (const unsigned char *)s;
3741 const unsigned char *end = p + string_size;
3742 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
3743 int err = 0;
3744
3745 for (; p < end && !err; ++p, ++char_count) {
3746 /* Only check value if it's not a ASCII char... */
3747 if (*p < 0x80) {
3748 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
3749 an explanation. */
3750 if (!((size_t) p & LONG_PTR_MASK)) {
3751 /* Help register allocation */
3752 register const unsigned char *_p = p;
3753 while (_p < aligned_end) {
3754 unsigned long value = *(unsigned long *) _p;
3755 if (value & ASCII_CHAR_MASK)
3756 break;
3757 _p += SIZEOF_LONG;
3758 char_count += SIZEOF_LONG;
3759 }
3760 p = _p;
3761 if (p == end)
3762 break;
3763 }
3764 }
3765 if (*p >= 0x80) {
3766 n = utf8_code_length[*p];
3767 new_max = max_char;
3768 switch (n) {
3769 /* invalid start byte */
3770 case 0:
3771 err = 1;
3772 break;
3773 case 2:
3774 /* Code points between 0x00FF and 0x07FF inclusive.
3775 Approximate the upper bound of the code point,
3776 if this flips over 255 we can be sure it will be more
3777 than 255 and the string will need 2 bytes per code coint,
3778 if it stays under or equal to 255, we can be sure 1 byte
3779 is enough.
3780 ((*p & 0b00011111) << 6) | 0b00111111 */
3781 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
3782 if (max_char < upper_bound)
3783 new_max = upper_bound;
3784 /* Ensure we track at least that we left ASCII space. */
3785 if (new_max < 128)
3786 new_max = 128;
3787 break;
3788 case 3:
3789 /* Between 0x0FFF and 0xFFFF inclusive, so values are
3790 always > 255 and <= 65535 and will always need 2 bytes. */
3791 if (max_char < 65535)
3792 new_max = 65535;
3793 break;
3794 case 4:
3795 /* Code point will be above 0xFFFF for sure in this case. */
3796 new_max = 65537;
3797 break;
3798 /* Internal error, this should be caught by the first if */
3799 case 1:
3800 default:
3801 assert(0 && "Impossible case in utf8_max_char_and_size");
3802 err = 1;
3803 }
3804 /* Instead of number of overall bytes for this code point,
3805 n containts the number of following bytes: */
3806 --n;
3807 /* Check if the follow up chars are all valid continuation bytes */
3808 if (n >= 1) {
3809 const unsigned char *cont;
3810 if ((p + n) >= end) {
3811 if (consumed == 0)
3812 /* incomplete data, non-incremental decoding */
3813 err = 1;
3814 break;
3815 }
3816 for (cont = p + 1; cont < (p + n); ++cont) {
3817 if ((*cont & 0xc0) != 0x80) {
3818 err = 1;
3819 break;
3820 }
3821 }
3822 p += n;
3823 }
3824 else
3825 err = 1;
3826 max_char = new_max;
3827 }
3828 }
3829
3830 if (unicode_size)
3831 *unicode_size = char_count;
3832 if (has_errors)
3833 *has_errors = err;
3834 return max_char;
3835}
3836
/* Store `value` at position `index` of the character buffer `buf`, using
   the element width selected by `kind`.  Similar to PyUnicode_WRITE, but
   PyUnicode_WCHAR_KIND is accepted as well so that the wstr field of the
   legacy unicode representation can be written.  Any kind other than
   WCHAR, 1BYTE or 2BYTE is stored as Py_UCS4.  `kind` is evaluated once;
   `buf`, `index` and `value` are evaluated once, in the selected arm. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value)                     \
    do {                                                                    \
        const int k_ = (kind);                                              \
        switch (k_) {                                                       \
        case PyUnicode_WCHAR_KIND:                                          \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);           \
            break;                                                          \
        case PyUnicode_1BYTE_KIND:                                          \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value);     \
            break;                                                          \
        case PyUnicode_2BYTE_KIND:                                          \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value);                 \
            break;                                                          \
        default:                                                            \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value);                 \
            break;                                                          \
        }                                                                   \
    } while (0)
3851
Alexander Belopolsky40018472011-02-26 01:02:56 +00003852PyObject *
3853PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003854 Py_ssize_t size,
3855 const char *errors,
3856 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003857{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003858 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003859 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00003860 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003861 Py_ssize_t startinpos;
3862 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00003863 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003864 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003865 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003866 PyObject *errorHandler = NULL;
3867 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003868 Py_UCS4 maxchar = 0;
3869 Py_ssize_t unicode_size;
3870 Py_ssize_t i;
3871 int kind;
3872 void *data;
3873 int has_errors;
3874 Py_UNICODE *error_outptr;
3875#if SIZEOF_WCHAR_T == 2
3876 Py_ssize_t wchar_offset = 0;
3877#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003878
Walter Dörwald69652032004-09-07 20:24:22 +00003879 if (size == 0) {
3880 if (consumed)
3881 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003882 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00003883 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003884 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
3885 consumed, &has_errors);
3886 if (has_errors) {
3887 unicode = _PyUnicode_New(size);
3888 if (!unicode)
3889 return NULL;
3890 kind = PyUnicode_WCHAR_KIND;
3891 data = PyUnicode_AS_UNICODE(unicode);
3892 assert(data != NULL);
3893 }
3894 else {
3895 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
3896 if (!unicode)
3897 return NULL;
3898 /* When the string is ASCII only, just use memcpy and return.
3899 unicode_size may be != size if there is an incomplete UTF-8
3900 sequence at the end of the ASCII block. */
3901 if (maxchar < 128 && size == unicode_size) {
3902 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
3903 return (PyObject *)unicode;
3904 }
3905 kind = PyUnicode_KIND(unicode);
3906 data = PyUnicode_DATA(unicode);
3907 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003908 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003909 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003910 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00003911 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003912
3913 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003914 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003915
3916 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00003917 /* Fast path for runs of ASCII characters. Given that common UTF-8
3918 input will consist of an overwhelming majority of ASCII
3919 characters, we try to optimize for this case by checking
3920 as many characters as a C 'long' can contain.
3921 First, check if we can do an aligned read, as most CPUs have
3922 a penalty for unaligned reads.
3923 */
3924 if (!((size_t) s & LONG_PTR_MASK)) {
3925 /* Help register allocation */
3926 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003927 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003928 while (_s < aligned_end) {
3929 /* Read a whole long at a time (either 4 or 8 bytes),
3930 and do a fast unrolled copy if it only contains ASCII
3931 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003932 unsigned long value = *(unsigned long *) _s;
3933 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00003934 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003935 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
3936 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
3937 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
3938 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00003939#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003940 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
3941 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
3942 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
3943 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00003944#endif
3945 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003946 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00003947 }
3948 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003949 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003950 if (s == e)
3951 break;
3952 ch = (unsigned char)*s;
3953 }
3954 }
3955
3956 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003957 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003958 s++;
3959 continue;
3960 }
3961
3962 n = utf8_code_length[ch];
3963
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003964 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003965 if (consumed)
3966 break;
3967 else {
3968 errmsg = "unexpected end of data";
3969 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003970 endinpos = startinpos+1;
3971 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
3972 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00003973 goto utf8Error;
3974 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003975 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003976
3977 switch (n) {
3978
3979 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00003980 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003981 startinpos = s-starts;
3982 endinpos = startinpos+1;
3983 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003984
3985 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003986 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00003987 startinpos = s-starts;
3988 endinpos = startinpos+1;
3989 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003990
3991 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003992 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00003993 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003994 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003995 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00003996 goto utf8Error;
3997 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003998 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00003999 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004000 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004001 break;
4002
4003 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004004 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4005 will result in surrogates in range d800-dfff. Surrogates are
4006 not valid UTF-8 so they are rejected.
4007 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4008 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004009 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004010 (s[2] & 0xc0) != 0x80 ||
4011 ((unsigned char)s[0] == 0xE0 &&
4012 (unsigned char)s[1] < 0xA0) ||
4013 ((unsigned char)s[0] == 0xED &&
4014 (unsigned char)s[1] > 0x9F)) {
4015 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004016 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004017 endinpos = startinpos + 1;
4018
4019 /* if s[1] first two bits are 1 and 0, then the invalid
4020 continuation byte is s[2], so increment endinpos by 1,
4021 if not, s[1] is invalid and endinpos doesn't need to
4022 be incremented. */
4023 if ((s[1] & 0xC0) == 0x80)
4024 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004025 goto utf8Error;
4026 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004027 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004028 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004029 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004030 break;
4031
4032 case 4:
4033 if ((s[1] & 0xc0) != 0x80 ||
4034 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004035 (s[3] & 0xc0) != 0x80 ||
4036 ((unsigned char)s[0] == 0xF0 &&
4037 (unsigned char)s[1] < 0x90) ||
4038 ((unsigned char)s[0] == 0xF4 &&
4039 (unsigned char)s[1] > 0x8F)) {
4040 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004041 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004042 endinpos = startinpos + 1;
4043 if ((s[1] & 0xC0) == 0x80) {
4044 endinpos++;
4045 if ((s[2] & 0xC0) == 0x80)
4046 endinpos++;
4047 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004048 goto utf8Error;
4049 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004050 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004051 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4052 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4053
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004054 /* If the string is flexible or we have native UCS-4, write
4055 directly.. */
4056 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
4057 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00004058
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004059 else {
4060 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00004061
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004062 /* translate from 10000..10FFFF to 0..FFFF */
4063 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00004064
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004065 /* high surrogate = top 10 bits added to D800 */
4066 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4067 (Py_UNICODE)(0xD800 + (ch >> 10)));
4068
4069 /* low surrogate = bottom 10 bits added to DC00 */
4070 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4071 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
4072 }
4073#if SIZEOF_WCHAR_T == 2
4074 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004075#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004076 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004077 }
4078 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004079 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004080
Benjamin Peterson29060642009-01-31 22:14:21 +00004081 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004082 /* If this is not yet a resizable string, make it one.. */
4083 if (kind != PyUnicode_WCHAR_KIND) {
4084 const Py_UNICODE *u;
4085 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
4086 if (!new_unicode)
4087 goto onError;
4088 u = PyUnicode_AsUnicode((PyObject *)unicode);
4089 if (!u)
4090 goto onError;
4091#if SIZEOF_WCHAR_T == 2
4092 i += wchar_offset;
4093#endif
4094 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
4095 Py_DECREF(unicode);
4096 unicode = new_unicode;
4097 kind = 0;
4098 data = PyUnicode_AS_UNICODE(new_unicode);
4099 assert(data != NULL);
4100 }
4101 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00004102 if (unicode_decode_call_errorhandler(
4103 errors, &errorHandler,
4104 "utf8", errmsg,
4105 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004106 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00004107 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004108 /* Update data because unicode_decode_call_errorhandler might have
4109 re-created or resized the unicode object. */
4110 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00004111 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004112 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004113 /* Ensure the unicode_size calculation above was correct: */
4114 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
4115
Walter Dörwald69652032004-09-07 20:24:22 +00004116 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004117 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004118
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004119 /* Adjust length and ready string when it contained errors and
4120 is of the old resizable kind. */
4121 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02004122 if (PyUnicode_Resize((PyObject**)&unicode, i) < 0 ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004123 PyUnicode_READY(unicode) == -1)
4124 goto onError;
4125 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004126
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004127 Py_XDECREF(errorHandler);
4128 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004129 if (PyUnicode_READY(unicode) == -1) {
4130 Py_DECREF(unicode);
4131 return NULL;
4132 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004133 return (PyObject *)unicode;
4134
Benjamin Peterson29060642009-01-31 22:14:21 +00004135 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004136 Py_XDECREF(errorHandler);
4137 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004138 Py_DECREF(unicode);
4139 return NULL;
4140}
4141
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004142#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00004143
#ifdef __APPLE__

/* Simplified UTF-8 decoder with the surrogateescape policy built in,
   used to decode the command line arguments on Mac OS X.

   Decodes `size` bytes starting at `s` into a freshly PyMem_Malloc'ed,
   NUL-terminated wchar_t array.  Whenever a well-formed sequence cannot be
   decoded at the current position (bad start byte, bad continuation byte,
   overlong or surrogate encoding, truncated input), only the current byte
   is consumed and it is emitted as 0xDC00 + byte.

   Returns NULL if the buffer size would overflow (after PyErr_NoMemory())
   or if the allocation fails. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const unsigned char *in, *stop;
    wchar_t *result, *out;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    result = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (!result)
        return NULL;

    /* Unpack UTF-8 encoded data */
    out = result;
    in = (const unsigned char *)s;
    stop = in + size;
    while (in < stop) {
        /* `lead` keeps the first byte: it is what gets escaped on error */
        const Py_UCS4 lead = in[0];
        Py_UCS4 cp;
        int seqlen;
        int escape;

        if (lead < 0x80) {
            *out++ = (wchar_t)lead;
            in++;
            continue;
        }

        seqlen = utf8_code_length[lead];
        /* a sequence running past the end of the input is escaped too */
        escape = (in + seqlen > stop);

        if (!escape) {
            switch (seqlen) {
            case 0:
            case 1:
                escape = 1;
                break;

            case 2:
                if ((in[1] & 0xc0) != 0x80) {
                    escape = 1;
                    break;
                }
                cp = ((in[0] & 0x1f) << 6) + (in[1] & 0x3f);
                assert ((cp > 0x007F) && (cp <= 0x07FF));
                *out++ = (wchar_t)cp;
                break;

            case 3:
                /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
                   will result in surrogates in range d800-dfff. Surrogates are
                   not valid UTF-8 so they are rejected.
                   See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
                   (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
                if ((in[1] & 0xc0) != 0x80 ||
                    (in[2] & 0xc0) != 0x80 ||
                    (in[0] == 0xE0 && in[1] < 0xA0) ||
                    (in[0] == 0xED && in[1] > 0x9F)) {
                    escape = 1;
                    break;
                }
                cp = ((in[0] & 0x0f) << 12) + ((in[1] & 0x3f) << 6)
                     + (in[2] & 0x3f);
                assert ((cp > 0x07FF) && (cp <= 0xFFFF));
                *out++ = (wchar_t)cp;
                break;

            case 4:
                if ((in[1] & 0xc0) != 0x80 ||
                    (in[2] & 0xc0) != 0x80 ||
                    (in[3] & 0xc0) != 0x80 ||
                    (in[0] == 0xF0 && in[1] < 0x90) ||
                    (in[0] == 0xF4 && in[1] > 0x8F)) {
                    escape = 1;
                    break;
                }
                cp = ((in[0] & 0x7) << 18) + ((in[1] & 0x3f) << 12) +
                     ((in[2] & 0x3f) << 6) + (in[3] & 0x3f);
                assert ((cp > 0xFFFF) && (cp <= 0x10ffff));

#if SIZEOF_WCHAR_T == 4
                *out++ = (wchar_t)cp;
#else
                /* compute and append the two surrogates: */

                /* translate from 10000..10FFFF to 0..FFFF */
                cp -= 0x10000;

                /* high surrogate = top 10 bits added to D800 */
                *out++ = (wchar_t)(0xD800 + (cp >> 10));

                /* low surrogate = bottom 10 bits added to DC00 */
                *out++ = (wchar_t)(0xDC00 + (cp & 0x03FF));
#endif
                break;
            }
        }

        if (escape) {
            /* surrogateescape: map the offending byte to U+DC80..U+DCFF
               and resynchronize on the next byte */
            *out++ = 0xDC00 + lead;
            in++;
        }
        else
            in += seqlen;
    }
    *out = L'\0';
    return result;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004258
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004259/* Primary internal function which creates utf8 encoded bytes objects.
4260
4261 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004262 and allocate exactly as much space needed at the end. Else allocate the
4263 maximum possible needed (4 result bytes per Unicode character), and return
4264 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004265*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004266PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004267_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004268{
Tim Peters602f7402002-04-27 18:03:26 +00004269#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004270
Guido van Rossum98297ee2007-11-06 21:34:58 +00004271 Py_ssize_t i; /* index into s of next input byte */
4272 PyObject *result; /* result string object */
4273 char *p; /* next free byte in output buffer */
4274 Py_ssize_t nallocated; /* number of result bytes allocated */
4275 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004276 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004277 PyObject *errorHandler = NULL;
4278 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004279 int kind;
4280 void *data;
4281 Py_ssize_t size;
4282 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
4283#if SIZEOF_WCHAR_T == 2
4284 Py_ssize_t wchar_offset = 0;
4285#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004286
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004287 if (!PyUnicode_Check(unicode)) {
4288 PyErr_BadArgument();
4289 return NULL;
4290 }
4291
4292 if (PyUnicode_READY(unicode) == -1)
4293 return NULL;
4294
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004295 if (PyUnicode_UTF8(unicode))
4296 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4297 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004298
4299 kind = PyUnicode_KIND(unicode);
4300 data = PyUnicode_DATA(unicode);
4301 size = PyUnicode_GET_LENGTH(unicode);
4302
Tim Peters602f7402002-04-27 18:03:26 +00004303 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004304
Tim Peters602f7402002-04-27 18:03:26 +00004305 if (size <= MAX_SHORT_UNICHARS) {
4306 /* Write into the stack buffer; nallocated can't overflow.
4307 * At the end, we'll allocate exactly as much heap space as it
4308 * turns out we need.
4309 */
4310 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004311 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004312 p = stackbuf;
4313 }
4314 else {
4315 /* Overallocate on the heap, and give the excess back at the end. */
4316 nallocated = size * 4;
4317 if (nallocated / 4 != size) /* overflow! */
4318 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004319 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004320 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004321 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004322 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004323 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004324
Tim Peters602f7402002-04-27 18:03:26 +00004325 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004326 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004327
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004328 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004329 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004330 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004331
Guido van Rossumd57fd912000-03-10 22:53:23 +00004332 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004333 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004334 *p++ = (char)(0xc0 | (ch >> 6));
4335 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004336 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004337 Py_ssize_t newpos;
4338 PyObject *rep;
4339 Py_ssize_t repsize, k, startpos;
4340 startpos = i-1;
4341#if SIZEOF_WCHAR_T == 2
4342 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00004343#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004344 rep = unicode_encode_call_errorhandler(
4345 errors, &errorHandler, "utf-8", "surrogates not allowed",
4346 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4347 &exc, startpos, startpos+1, &newpos);
4348 if (!rep)
4349 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004350
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004351 if (PyBytes_Check(rep))
4352 repsize = PyBytes_GET_SIZE(rep);
4353 else
4354 repsize = PyUnicode_GET_SIZE(rep);
4355
4356 if (repsize > 4) {
4357 Py_ssize_t offset;
4358
4359 if (result == NULL)
4360 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004361 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004362 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004363
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004364 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4365 /* integer overflow */
4366 PyErr_NoMemory();
4367 goto error;
4368 }
4369 nallocated += repsize - 4;
4370 if (result != NULL) {
4371 if (_PyBytes_Resize(&result, nallocated) < 0)
4372 goto error;
4373 } else {
4374 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004375 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004376 goto error;
4377 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4378 }
4379 p = PyBytes_AS_STRING(result) + offset;
4380 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004381
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004382 if (PyBytes_Check(rep)) {
4383 char *prep = PyBytes_AS_STRING(rep);
4384 for(k = repsize; k > 0; k--)
4385 *p++ = *prep++;
4386 } else /* rep is unicode */ {
4387 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4388 Py_UNICODE c;
4389
4390 for(k=0; k<repsize; k++) {
4391 c = prep[k];
4392 if (0x80 <= c) {
4393 raise_encode_exception(&exc, "utf-8",
4394 PyUnicode_AS_UNICODE(unicode),
4395 size, i-1, i,
4396 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004397 goto error;
4398 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004399 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004400 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004401 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004402 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004403 } else if (ch < 0x10000) {
4404 *p++ = (char)(0xe0 | (ch >> 12));
4405 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4406 *p++ = (char)(0x80 | (ch & 0x3f));
4407 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004408 /* Encode UCS4 Unicode ordinals */
4409 *p++ = (char)(0xf0 | (ch >> 18));
4410 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4411 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4412 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004413#if SIZEOF_WCHAR_T == 2
4414 wchar_offset++;
4415#endif
Tim Peters602f7402002-04-27 18:03:26 +00004416 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004417 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004418
Guido van Rossum98297ee2007-11-06 21:34:58 +00004419 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004420 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004421 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004422 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004423 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004424 }
4425 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004426 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004427 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004428 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004429 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004430 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004431
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004432 Py_XDECREF(errorHandler);
4433 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004434 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004435 error:
4436 Py_XDECREF(errorHandler);
4437 Py_XDECREF(exc);
4438 Py_XDECREF(result);
4439 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004440
Tim Peters602f7402002-04-27 18:03:26 +00004441#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004442}
4443
Alexander Belopolsky40018472011-02-26 01:02:56 +00004444PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004445PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4446 Py_ssize_t size,
4447 const char *errors)
4448{
4449 PyObject *v, *unicode;
4450
4451 unicode = PyUnicode_FromUnicode(s, size);
4452 if (unicode == NULL)
4453 return NULL;
4454 v = _PyUnicode_AsUTF8String(unicode, errors);
4455 Py_DECREF(unicode);
4456 return v;
4457}
4458
4459PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004460PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004461{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004462 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004463}
4464
Walter Dörwald41980ca2007-08-16 21:55:45 +00004465/* --- UTF-32 Codec ------------------------------------------------------- */
4466
4467PyObject *
4468PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004469 Py_ssize_t size,
4470 const char *errors,
4471 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004472{
4473 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4474}
4475
4476PyObject *
4477PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004478 Py_ssize_t size,
4479 const char *errors,
4480 int *byteorder,
4481 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004482{
4483 const char *starts = s;
4484 Py_ssize_t startinpos;
4485 Py_ssize_t endinpos;
4486 Py_ssize_t outpos;
4487 PyUnicodeObject *unicode;
4488 Py_UNICODE *p;
4489#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004490 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004491 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004492#else
4493 const int pairs = 0;
4494#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004495 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004496 int bo = 0; /* assume native ordering by default */
4497 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004498 /* Offsets from q for retrieving bytes in the right order. */
4499#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4500 int iorder[] = {0, 1, 2, 3};
4501#else
4502 int iorder[] = {3, 2, 1, 0};
4503#endif
4504 PyObject *errorHandler = NULL;
4505 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004506
Walter Dörwald41980ca2007-08-16 21:55:45 +00004507 q = (unsigned char *)s;
4508 e = q + size;
4509
4510 if (byteorder)
4511 bo = *byteorder;
4512
4513 /* Check for BOM marks (U+FEFF) in the input and adjust current
4514 byte order setting accordingly. In native mode, the leading BOM
4515 mark is skipped, in all other modes, it is copied to the output
4516 stream as-is (giving a ZWNBSP character). */
4517 if (bo == 0) {
4518 if (size >= 4) {
4519 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004520 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004521#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004522 if (bom == 0x0000FEFF) {
4523 q += 4;
4524 bo = -1;
4525 }
4526 else if (bom == 0xFFFE0000) {
4527 q += 4;
4528 bo = 1;
4529 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004530#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004531 if (bom == 0x0000FEFF) {
4532 q += 4;
4533 bo = 1;
4534 }
4535 else if (bom == 0xFFFE0000) {
4536 q += 4;
4537 bo = -1;
4538 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004539#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004540 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004541 }
4542
4543 if (bo == -1) {
4544 /* force LE */
4545 iorder[0] = 0;
4546 iorder[1] = 1;
4547 iorder[2] = 2;
4548 iorder[3] = 3;
4549 }
4550 else if (bo == 1) {
4551 /* force BE */
4552 iorder[0] = 3;
4553 iorder[1] = 2;
4554 iorder[2] = 1;
4555 iorder[3] = 0;
4556 }
4557
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004558 /* On narrow builds we split characters outside the BMP into two
4559 codepoints => count how much extra space we need. */
4560#ifndef Py_UNICODE_WIDE
4561 for (qq = q; qq < e; qq += 4)
4562 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4563 pairs++;
4564#endif
4565
4566 /* This might be one to much, because of a BOM */
4567 unicode = _PyUnicode_New((size+3)/4+pairs);
4568 if (!unicode)
4569 return NULL;
4570 if (size == 0)
4571 return (PyObject *)unicode;
4572
4573 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004574 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004575
Walter Dörwald41980ca2007-08-16 21:55:45 +00004576 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004577 Py_UCS4 ch;
4578 /* remaining bytes at the end? (size should be divisible by 4) */
4579 if (e-q<4) {
4580 if (consumed)
4581 break;
4582 errmsg = "truncated data";
4583 startinpos = ((const char *)q)-starts;
4584 endinpos = ((const char *)e)-starts;
4585 goto utf32Error;
4586 /* The remaining input chars are ignored if the callback
4587 chooses to skip the input */
4588 }
4589 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4590 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004591
Benjamin Peterson29060642009-01-31 22:14:21 +00004592 if (ch >= 0x110000)
4593 {
4594 errmsg = "codepoint not in range(0x110000)";
4595 startinpos = ((const char *)q)-starts;
4596 endinpos = startinpos+4;
4597 goto utf32Error;
4598 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004599#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004600 if (ch >= 0x10000)
4601 {
4602 *p++ = 0xD800 | ((ch-0x10000) >> 10);
4603 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4604 }
4605 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00004606#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004607 *p++ = ch;
4608 q += 4;
4609 continue;
4610 utf32Error:
4611 outpos = p-PyUnicode_AS_UNICODE(unicode);
4612 if (unicode_decode_call_errorhandler(
4613 errors, &errorHandler,
4614 "utf32", errmsg,
4615 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4616 &unicode, &outpos, &p))
4617 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004618 }
4619
4620 if (byteorder)
4621 *byteorder = bo;
4622
4623 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004624 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004625
4626 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02004627 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004628 goto onError;
4629
4630 Py_XDECREF(errorHandler);
4631 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004632 if (PyUnicode_READY(unicode) == -1) {
4633 Py_DECREF(unicode);
4634 return NULL;
4635 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004636 return (PyObject *)unicode;
4637
Benjamin Peterson29060642009-01-31 22:14:21 +00004638 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00004639 Py_DECREF(unicode);
4640 Py_XDECREF(errorHandler);
4641 Py_XDECREF(exc);
4642 return NULL;
4643}
4644
4645PyObject *
4646PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004647 Py_ssize_t size,
4648 const char *errors,
4649 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004650{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004651 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004652 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004653 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004654#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004655 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004656#else
4657 const int pairs = 0;
4658#endif
4659 /* Offsets from p for storing byte pairs in the right order. */
4660#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4661 int iorder[] = {0, 1, 2, 3};
4662#else
4663 int iorder[] = {3, 2, 1, 0};
4664#endif
4665
Benjamin Peterson29060642009-01-31 22:14:21 +00004666#define STORECHAR(CH) \
4667 do { \
4668 p[iorder[3]] = ((CH) >> 24) & 0xff; \
4669 p[iorder[2]] = ((CH) >> 16) & 0xff; \
4670 p[iorder[1]] = ((CH) >> 8) & 0xff; \
4671 p[iorder[0]] = (CH) & 0xff; \
4672 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00004673 } while(0)
4674
4675 /* In narrow builds we can output surrogate pairs as one codepoint,
4676 so we need less space. */
4677#ifndef Py_UNICODE_WIDE
4678 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004679 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
4680 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
4681 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004682#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004683 nsize = (size - pairs + (byteorder == 0));
4684 bytesize = nsize * 4;
4685 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004686 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004687 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004688 if (v == NULL)
4689 return NULL;
4690
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004691 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004692 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004693 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004694 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004695 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004696
4697 if (byteorder == -1) {
4698 /* force LE */
4699 iorder[0] = 0;
4700 iorder[1] = 1;
4701 iorder[2] = 2;
4702 iorder[3] = 3;
4703 }
4704 else if (byteorder == 1) {
4705 /* force BE */
4706 iorder[0] = 3;
4707 iorder[1] = 2;
4708 iorder[2] = 1;
4709 iorder[3] = 0;
4710 }
4711
4712 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004713 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004714#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004715 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
4716 Py_UCS4 ch2 = *s;
4717 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
4718 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
4719 s++;
4720 size--;
4721 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004722 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004723#endif
4724 STORECHAR(ch);
4725 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004726
4727 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004728 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004729#undef STORECHAR
4730}
4731
Alexander Belopolsky40018472011-02-26 01:02:56 +00004732PyObject *
4733PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004734{
4735 if (!PyUnicode_Check(unicode)) {
4736 PyErr_BadArgument();
4737 return NULL;
4738 }
4739 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004740 PyUnicode_GET_SIZE(unicode),
4741 NULL,
4742 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004743}
4744
Guido van Rossumd57fd912000-03-10 22:53:23 +00004745/* --- UTF-16 Codec ------------------------------------------------------- */
4746
Tim Peters772747b2001-08-09 22:21:55 +00004747PyObject *
4748PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004749 Py_ssize_t size,
4750 const char *errors,
4751 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004752{
Walter Dörwald69652032004-09-07 20:24:22 +00004753 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
4754}
4755
/* Two masks for fast checking of whether a C 'long' may contain
   UTF16-encoded surrogate characters. This is an efficient heuristic,
   assuming that non-surrogate characters with a code point >= 0x8000 are
   rare in most input.
   FAST_CHAR_MASK is used when the input is in native byte ordering,
   SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.

   Each mask selects the top bit of every 16-bit unit packed in the long
   (bit 15 of the unit in native order, bit 7 in swapped order).  A zero
   result of `data & MASK` therefore means every unit is below 0x8000 and
   cannot be a surrogate (0xD800..0xDFFF); a non-zero result only means
   "might be", and the caller falls back to the per-character path.
   Both macros are #undef'ed after PyUnicode_DecodeUTF16Stateful().
*/
#if (SIZEOF_LONG == 8)
# define FAST_CHAR_MASK         0x8000800080008000L
# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
#elif (SIZEOF_LONG == 4)
# define FAST_CHAR_MASK         0x80008000L
# define SWAPPED_FAST_CHAR_MASK 0x00800080L
#else
# error C 'long' size should be either 4 or 8!
#endif
4772
Walter Dörwald69652032004-09-07 20:24:22 +00004773PyObject *
4774PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004775 Py_ssize_t size,
4776 const char *errors,
4777 int *byteorder,
4778 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004779{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004780 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004781 Py_ssize_t startinpos;
4782 Py_ssize_t endinpos;
4783 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004784 PyUnicodeObject *unicode;
4785 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004786 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00004787 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00004788 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004789 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00004790 /* Offsets from q for retrieving byte pairs in the right order. */
4791#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4792 int ihi = 1, ilo = 0;
4793#else
4794 int ihi = 0, ilo = 1;
4795#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004796 PyObject *errorHandler = NULL;
4797 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004798
4799 /* Note: size will always be longer than the resulting Unicode
4800 character count */
4801 unicode = _PyUnicode_New(size);
4802 if (!unicode)
4803 return NULL;
4804 if (size == 0)
4805 return (PyObject *)unicode;
4806
4807 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004808 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00004809 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00004810 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004811
4812 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00004813 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004814
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004815 /* Check for BOM marks (U+FEFF) in the input and adjust current
4816 byte order setting accordingly. In native mode, the leading BOM
4817 mark is skipped, in all other modes, it is copied to the output
4818 stream as-is (giving a ZWNBSP character). */
4819 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00004820 if (size >= 2) {
4821 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004822#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004823 if (bom == 0xFEFF) {
4824 q += 2;
4825 bo = -1;
4826 }
4827 else if (bom == 0xFFFE) {
4828 q += 2;
4829 bo = 1;
4830 }
Tim Petersced69f82003-09-16 20:30:58 +00004831#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004832 if (bom == 0xFEFF) {
4833 q += 2;
4834 bo = 1;
4835 }
4836 else if (bom == 0xFFFE) {
4837 q += 2;
4838 bo = -1;
4839 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004840#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004841 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004842 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004843
Tim Peters772747b2001-08-09 22:21:55 +00004844 if (bo == -1) {
4845 /* force LE */
4846 ihi = 1;
4847 ilo = 0;
4848 }
4849 else if (bo == 1) {
4850 /* force BE */
4851 ihi = 0;
4852 ilo = 1;
4853 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004854#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4855 native_ordering = ilo < ihi;
4856#else
4857 native_ordering = ilo > ihi;
4858#endif
Tim Peters772747b2001-08-09 22:21:55 +00004859
Antoine Pitrouab868312009-01-10 15:40:25 +00004860 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00004861 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004862 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00004863 /* First check for possible aligned read of a C 'long'. Unaligned
4864 reads are more expensive, better to defer to another iteration. */
4865 if (!((size_t) q & LONG_PTR_MASK)) {
4866 /* Fast path for runs of non-surrogate chars. */
4867 register const unsigned char *_q = q;
4868 Py_UNICODE *_p = p;
4869 if (native_ordering) {
4870 /* Native ordering is simple: as long as the input cannot
4871 possibly contain a surrogate char, do an unrolled copy
4872 of several 16-bit code points to the target object.
4873 The non-surrogate check is done on several input bytes
4874 at a time (as many as a C 'long' can contain). */
4875 while (_q < aligned_end) {
4876 unsigned long data = * (unsigned long *) _q;
4877 if (data & FAST_CHAR_MASK)
4878 break;
4879 _p[0] = ((unsigned short *) _q)[0];
4880 _p[1] = ((unsigned short *) _q)[1];
4881#if (SIZEOF_LONG == 8)
4882 _p[2] = ((unsigned short *) _q)[2];
4883 _p[3] = ((unsigned short *) _q)[3];
4884#endif
4885 _q += SIZEOF_LONG;
4886 _p += SIZEOF_LONG / 2;
4887 }
4888 }
4889 else {
4890 /* Byteswapped ordering is similar, but we must decompose
4891 the copy bytewise, and take care of zero'ing out the
4892 upper bytes if the target object is in 32-bit units
4893 (that is, in UCS-4 builds). */
4894 while (_q < aligned_end) {
4895 unsigned long data = * (unsigned long *) _q;
4896 if (data & SWAPPED_FAST_CHAR_MASK)
4897 break;
4898 /* Zero upper bytes in UCS-4 builds */
4899#if (Py_UNICODE_SIZE > 2)
4900 _p[0] = 0;
4901 _p[1] = 0;
4902#if (SIZEOF_LONG == 8)
4903 _p[2] = 0;
4904 _p[3] = 0;
4905#endif
4906#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004907 /* Issue #4916; UCS-4 builds on big endian machines must
4908 fill the two last bytes of each 4-byte unit. */
4909#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
4910# define OFF 2
4911#else
4912# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00004913#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004914 ((unsigned char *) _p)[OFF + 1] = _q[0];
4915 ((unsigned char *) _p)[OFF + 0] = _q[1];
4916 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
4917 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
4918#if (SIZEOF_LONG == 8)
4919 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
4920 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
4921 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
4922 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
4923#endif
4924#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00004925 _q += SIZEOF_LONG;
4926 _p += SIZEOF_LONG / 2;
4927 }
4928 }
4929 p = _p;
4930 q = _q;
4931 if (q >= e)
4932 break;
4933 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004934 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004935
Benjamin Peterson14339b62009-01-31 16:36:08 +00004936 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00004937
4938 if (ch < 0xD800 || ch > 0xDFFF) {
4939 *p++ = ch;
4940 continue;
4941 }
4942
4943 /* UTF-16 code pair: */
4944 if (q > e) {
4945 errmsg = "unexpected end of data";
4946 startinpos = (((const char *)q) - 2) - starts;
4947 endinpos = ((const char *)e) + 1 - starts;
4948 goto utf16Error;
4949 }
4950 if (0xD800 <= ch && ch <= 0xDBFF) {
4951 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
4952 q += 2;
4953 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00004954#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004955 *p++ = ch;
4956 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004957#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004958 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004959#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004960 continue;
4961 }
4962 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004963 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00004964 startinpos = (((const char *)q)-4)-starts;
4965 endinpos = startinpos+2;
4966 goto utf16Error;
4967 }
4968
Benjamin Peterson14339b62009-01-31 16:36:08 +00004969 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004970 errmsg = "illegal encoding";
4971 startinpos = (((const char *)q)-2)-starts;
4972 endinpos = startinpos+2;
4973 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004974
Benjamin Peterson29060642009-01-31 22:14:21 +00004975 utf16Error:
4976 outpos = p - PyUnicode_AS_UNICODE(unicode);
4977 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00004978 errors,
4979 &errorHandler,
4980 "utf16", errmsg,
4981 &starts,
4982 (const char **)&e,
4983 &startinpos,
4984 &endinpos,
4985 &exc,
4986 (const char **)&q,
4987 &unicode,
4988 &outpos,
4989 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00004990 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004991 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004992 /* remaining byte at the end? (size should be even) */
4993 if (e == q) {
4994 if (!consumed) {
4995 errmsg = "truncated data";
4996 startinpos = ((const char *)q) - starts;
4997 endinpos = ((const char *)e) + 1 - starts;
4998 outpos = p - PyUnicode_AS_UNICODE(unicode);
4999 if (unicode_decode_call_errorhandler(
5000 errors,
5001 &errorHandler,
5002 "utf16", errmsg,
5003 &starts,
5004 (const char **)&e,
5005 &startinpos,
5006 &endinpos,
5007 &exc,
5008 (const char **)&q,
5009 &unicode,
5010 &outpos,
5011 &p))
5012 goto onError;
5013 /* The remaining input chars are ignored if the callback
5014 chooses to skip the input */
5015 }
5016 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005017
5018 if (byteorder)
5019 *byteorder = bo;
5020
Walter Dörwald69652032004-09-07 20:24:22 +00005021 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005022 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005023
Guido van Rossumd57fd912000-03-10 22:53:23 +00005024 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02005025 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005026 goto onError;
5027
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005028 Py_XDECREF(errorHandler);
5029 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005030 if (PyUnicode_READY(unicode) == -1) {
5031 Py_DECREF(unicode);
5032 return NULL;
5033 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005034 return (PyObject *)unicode;
5035
Benjamin Peterson29060642009-01-31 22:14:21 +00005036 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005037 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005038 Py_XDECREF(errorHandler);
5039 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005040 return NULL;
5041}
5042
Antoine Pitrouab868312009-01-10 15:40:25 +00005043#undef FAST_CHAR_MASK
5044#undef SWAPPED_FAST_CHAR_MASK
5045
Tim Peters772747b2001-08-09 22:21:55 +00005046PyObject *
5047PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005048 Py_ssize_t size,
5049 const char *errors,
5050 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005051{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005052 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005053 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005054 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005055#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005056 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005057#else
5058 const int pairs = 0;
5059#endif
Tim Peters772747b2001-08-09 22:21:55 +00005060 /* Offsets from p for storing byte pairs in the right order. */
5061#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5062 int ihi = 1, ilo = 0;
5063#else
5064 int ihi = 0, ilo = 1;
5065#endif
5066
Benjamin Peterson29060642009-01-31 22:14:21 +00005067#define STORECHAR(CH) \
5068 do { \
5069 p[ihi] = ((CH) >> 8) & 0xff; \
5070 p[ilo] = (CH) & 0xff; \
5071 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005072 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005073
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005074#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005075 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00005076 if (s[i] >= 0x10000)
5077 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005078#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005079 /* 2 * (size + pairs + (byteorder == 0)) */
5080 if (size > PY_SSIZE_T_MAX ||
5081 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005082 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005083 nsize = size + pairs + (byteorder == 0);
5084 bytesize = nsize * 2;
5085 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005086 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005087 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005088 if (v == NULL)
5089 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005090
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005091 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005092 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005093 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005094 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005095 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005096
5097 if (byteorder == -1) {
5098 /* force LE */
5099 ihi = 1;
5100 ilo = 0;
5101 }
5102 else if (byteorder == 1) {
5103 /* force BE */
5104 ihi = 0;
5105 ilo = 1;
5106 }
5107
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005108 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005109 Py_UNICODE ch = *s++;
5110 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005111#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005112 if (ch >= 0x10000) {
5113 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5114 ch = 0xD800 | ((ch-0x10000) >> 10);
5115 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005116#endif
Tim Peters772747b2001-08-09 22:21:55 +00005117 STORECHAR(ch);
5118 if (ch2)
5119 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005120 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005121
5122 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005123 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005124#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005125}
5126
Alexander Belopolsky40018472011-02-26 01:02:56 +00005127PyObject *
5128PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005129{
5130 if (!PyUnicode_Check(unicode)) {
5131 PyErr_BadArgument();
5132 return NULL;
5133 }
5134 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005135 PyUnicode_GET_SIZE(unicode),
5136 NULL,
5137 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005138}
5139
5140/* --- Unicode Escape Codec ----------------------------------------------- */
5141
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005142/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5143 if all the escapes in the string make it still a valid ASCII string.
5144 Returns -1 if any escapes were found which cause the string to
5145 pop out of ASCII range. Otherwise returns the length of the
5146 required buffer to hold the string.
5147 */
5148Py_ssize_t
5149length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5150{
5151 const unsigned char *p = (const unsigned char *)s;
5152 const unsigned char *end = p + size;
5153 Py_ssize_t length = 0;
5154
5155 if (size < 0)
5156 return -1;
5157
5158 for (; p < end; ++p) {
5159 if (*p > 127) {
5160 /* Non-ASCII */
5161 return -1;
5162 }
5163 else if (*p != '\\') {
5164 /* Normal character */
5165 ++length;
5166 }
5167 else {
5168 /* Backslash-escape, check next char */
5169 ++p;
5170 /* Escape sequence reaches till end of string or
5171 non-ASCII follow-up. */
5172 if (p >= end || *p > 127)
5173 return -1;
5174 switch (*p) {
5175 case '\n':
5176 /* backslash + \n result in zero characters */
5177 break;
5178 case '\\': case '\'': case '\"':
5179 case 'b': case 'f': case 't':
5180 case 'n': case 'r': case 'v': case 'a':
5181 ++length;
5182 break;
5183 case '0': case '1': case '2': case '3':
5184 case '4': case '5': case '6': case '7':
5185 case 'x': case 'u': case 'U': case 'N':
5186 /* these do not guarantee ASCII characters */
5187 return -1;
5188 default:
5189 /* count the backslash + the other character */
5190 length += 2;
5191 }
5192 }
5193 }
5194 return length;
5195}
5196
/* Similar to PyUnicode_WRITE but either write into wstr field
   or treat string as ASCII.

   kind  - PyUnicode_WCHAR_KIND selects Py_UNICODE-sized stores; any other
           kind is written as a single unsigned char.
   buf   - destination buffer, cast according to `kind`.
   index - element index into `buf`.
   value - value to store, truncated by the cast to the element type. */
#define WRITE_ASCII_OR_WSTR(kind, buf, index, value)                    \
    do {                                                                \
        if ((kind) != PyUnicode_WCHAR_KIND)                             \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
        else                                                            \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);       \
    } while (0)

/* Store `value` as a Py_UNICODE at buf[index].
   NOTE: `kind` is not a macro parameter; the macro expects a variable
   named `kind` in the scope where it is expanded and asserts that it is
   PyUnicode_WCHAR_KIND.  The expansion is a comma expression, not a
   do/while block. */
#define WRITE_WSTR(buf, index, value)                   \
    assert(kind == PyUnicode_WCHAR_KIND),               \
    ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value)
5210
5211
/* File-local pointer to a _PyUnicode_Name_CAPI table; starts out NULL.
   NOTE(review): presumably filled in lazily the first time a \N{...}
   escape needs a character-name lookup -- confirm against the code that
   assigns it. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005213
Alexander Belopolsky40018472011-02-26 01:02:56 +00005214PyObject *
5215PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005216 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005217 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005218{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005219 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005220 Py_ssize_t startinpos;
5221 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005222 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005223 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005224 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005225 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005226 char* message;
5227 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005228 PyObject *errorHandler = NULL;
5229 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005230 Py_ssize_t ascii_length;
5231 Py_ssize_t i;
5232 int kind;
5233 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005234
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005235 ascii_length = length_of_escaped_ascii_string(s, size);
5236
5237 /* After length_of_escaped_ascii_string() there are two alternatives,
5238 either the string is pure ASCII with named escapes like \n, etc.
5239 and we determined it's exact size (common case)
5240 or it contains \x, \u, ... escape sequences. then we create a
5241 legacy wchar string and resize it at the end of this function. */
5242 if (ascii_length >= 0) {
5243 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
5244 if (!v)
5245 goto onError;
5246 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5247 kind = PyUnicode_1BYTE_KIND;
5248 data = PyUnicode_DATA(v);
5249 }
5250 else {
5251 /* Escaped strings will always be longer than the resulting
5252 Unicode string, so we start with size here and then reduce the
5253 length after conversion to the true value.
5254 (but if the error callback returns a long replacement string
5255 we'll have to allocate more space) */
5256 v = _PyUnicode_New(size);
5257 if (!v)
5258 goto onError;
5259 kind = PyUnicode_WCHAR_KIND;
5260 data = PyUnicode_AS_UNICODE(v);
5261 }
5262
Guido van Rossumd57fd912000-03-10 22:53:23 +00005263 if (size == 0)
5264 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005265 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005266 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005267
Guido van Rossumd57fd912000-03-10 22:53:23 +00005268 while (s < end) {
5269 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005270 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005271 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005272
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005273 if (kind == PyUnicode_WCHAR_KIND) {
5274 assert(i < _PyUnicode_WSTR_LENGTH(v));
5275 }
5276 else {
5277 /* The only case in which i == ascii_length is a backslash
5278 followed by a newline. */
5279 assert(i <= ascii_length);
5280 }
5281
Guido van Rossumd57fd912000-03-10 22:53:23 +00005282 /* Non-escape characters are interpreted as Unicode ordinals */
5283 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005284 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005285 continue;
5286 }
5287
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005288 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005289 /* \ - Escapes */
5290 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005291 c = *s++;
5292 if (s > end)
5293 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005294
5295 if (kind == PyUnicode_WCHAR_KIND) {
5296 assert(i < _PyUnicode_WSTR_LENGTH(v));
5297 }
5298 else {
5299 /* The only case in which i == ascii_length is a backslash
5300 followed by a newline. */
5301 assert(i < ascii_length || (i == ascii_length && c == '\n'));
5302 }
5303
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005304 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005305
Benjamin Peterson29060642009-01-31 22:14:21 +00005306 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005307 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005308 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5309 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5310 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5311 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5312 /* FF */
5313 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5314 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5315 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5316 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5317 /* VT */
5318 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5319 /* BEL, not classic C */
5320 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005321
Benjamin Peterson29060642009-01-31 22:14:21 +00005322 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005323 case '0': case '1': case '2': case '3':
5324 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005325 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005326 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005327 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005328 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005329 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005330 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005331 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005332 break;
5333
Benjamin Peterson29060642009-01-31 22:14:21 +00005334 /* hex escapes */
5335 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005336 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005337 digits = 2;
5338 message = "truncated \\xXX escape";
5339 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005340
Benjamin Peterson29060642009-01-31 22:14:21 +00005341 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005342 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005343 digits = 4;
5344 message = "truncated \\uXXXX escape";
5345 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005346
Benjamin Peterson29060642009-01-31 22:14:21 +00005347 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005348 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005349 digits = 8;
5350 message = "truncated \\UXXXXXXXX escape";
5351 hexescape:
5352 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005353 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005354 if (s+digits>end) {
5355 endinpos = size;
5356 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005357 errors, &errorHandler,
5358 "unicodeescape", "end of string in escape sequence",
5359 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005360 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005361 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005362 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005363 goto nextByte;
5364 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005365 for (j = 0; j < digits; ++j) {
5366 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005367 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005368 endinpos = (s+j+1)-starts;
5369 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005370 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005371 errors, &errorHandler,
5372 "unicodeescape", message,
5373 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005374 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005375 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005376 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005377 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005378 }
5379 chr = (chr<<4) & ~0xF;
5380 if (c >= '0' && c <= '9')
5381 chr += c - '0';
5382 else if (c >= 'a' && c <= 'f')
5383 chr += 10 + c - 'a';
5384 else
5385 chr += 10 + c - 'A';
5386 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005387 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005388 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005389 /* _decoding_error will have already written into the
5390 target buffer. */
5391 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005392 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005393 /* when we get here, chr is a 32-bit unicode character */
5394 if (chr <= 0xffff)
5395 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005396 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005397 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005398 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005399 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005400#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005401 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005402#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005403 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005404 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5405 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005406#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005407 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005408 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005409 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005410 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005411 errors, &errorHandler,
5412 "unicodeescape", "illegal Unicode character",
5413 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005414 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005415 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005416 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005417 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005418 break;
5419
Benjamin Peterson29060642009-01-31 22:14:21 +00005420 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005421 case 'N':
5422 message = "malformed \\N character escape";
5423 if (ucnhash_CAPI == NULL) {
5424 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005425 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5426 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005427 if (ucnhash_CAPI == NULL)
5428 goto ucnhashError;
5429 }
5430 if (*s == '{') {
5431 const char *start = s+1;
5432 /* look for the closing brace */
5433 while (*s != '}' && s < end)
5434 s++;
5435 if (s > start && s < end && *s == '}') {
5436 /* found a name. look it up in the unicode database */
5437 message = "unknown Unicode character name";
5438 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005439 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5440 &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005441 goto store;
5442 }
5443 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005444 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005445 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005446 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005447 errors, &errorHandler,
5448 "unicodeescape", message,
5449 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005450 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005451 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005452 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005453 break;
5454
5455 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005456 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005457 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005458 message = "\\ at end of string";
5459 s--;
5460 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005461 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005462 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005463 errors, &errorHandler,
5464 "unicodeescape", message,
5465 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005466 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005467 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005468 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005469 }
5470 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005471 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5472 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005473 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005474 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005475 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005476 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005477 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005478 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005479 /* Ensure the length prediction worked in case of ASCII strings */
5480 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5481
Victor Stinnerfe226c02011-10-03 03:52:20 +02005482 if (kind == PyUnicode_WCHAR_KIND)
5483 {
5484 if (PyUnicode_Resize((PyObject**)&v, i) < 0)
5485 goto onError;
5486 if (PyUnicode_READY(v) == -1)
5487 goto onError;
5488 }
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005489 Py_XDECREF(errorHandler);
5490 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005491 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005492
Benjamin Peterson29060642009-01-31 22:14:21 +00005493 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005494 PyErr_SetString(
5495 PyExc_UnicodeError,
5496 "\\N escapes not supported (can't load unicodedata module)"
5497 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005498 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005499 Py_XDECREF(errorHandler);
5500 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005501 return NULL;
5502
Benjamin Peterson29060642009-01-31 22:14:21 +00005503 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005504 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005505 Py_XDECREF(errorHandler);
5506 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005507 return NULL;
5508}
5509
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005510#undef WRITE_ASCII_OR_WSTR
5511#undef WRITE_WSTR
5512
Guido van Rossumd57fd912000-03-10 22:53:23 +00005513/* Return a Unicode-Escape string version of the Unicode object.
5514
5515 If quotes is true, the string is enclosed in u"" or u'' quotes as
5516 appropriate.
5517
5518*/
5519
Walter Dörwald79e913e2007-05-12 11:08:06 +00005520static const char *hexdigits = "0123456789abcdef";
5521
Alexander Belopolsky40018472011-02-26 01:02:56 +00005522PyObject *
5523PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005524 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005525{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005526 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005527 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005528
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005529#ifdef Py_UNICODE_WIDE
5530 const Py_ssize_t expandsize = 10;
5531#else
5532 const Py_ssize_t expandsize = 6;
5533#endif
5534
Thomas Wouters89f507f2006-12-13 04:49:30 +00005535 /* XXX(nnorwitz): rather than over-allocating, it would be
5536 better to choose a different scheme. Perhaps scan the
5537 first N-chars of the string and allocate based on that size.
5538 */
5539 /* Initial allocation is based on the longest-possible unichr
5540 escape.
5541
5542 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5543 unichr, so in this case it's the longest unichr escape. In
5544 narrow (UTF-16) builds this is five chars per source unichr
5545 since there are two unichrs in the surrogate pair, so in narrow
5546 (UTF-16) builds it's not the longest unichr escape.
5547
5548 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5549 so in the narrow (UTF-16) build case it's the longest unichr
5550 escape.
5551 */
5552
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005553 if (size == 0)
5554 return PyBytes_FromStringAndSize(NULL, 0);
5555
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005556 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005557 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005558
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005559 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005560 2
5561 + expandsize*size
5562 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005563 if (repr == NULL)
5564 return NULL;
5565
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005566 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005567
Guido van Rossumd57fd912000-03-10 22:53:23 +00005568 while (size-- > 0) {
5569 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005570
Walter Dörwald79e913e2007-05-12 11:08:06 +00005571 /* Escape backslashes */
5572 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005573 *p++ = '\\';
5574 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005575 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005576 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005577
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00005578#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005579 /* Map 21-bit characters to '\U00xxxxxx' */
5580 else if (ch >= 0x10000) {
5581 *p++ = '\\';
5582 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005583 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5584 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5585 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5586 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5587 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5588 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5589 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5590 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005591 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005592 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005593#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005594 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5595 else if (ch >= 0xD800 && ch < 0xDC00) {
5596 Py_UNICODE ch2;
5597 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00005598
Benjamin Peterson29060642009-01-31 22:14:21 +00005599 ch2 = *s++;
5600 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005601 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005602 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5603 *p++ = '\\';
5604 *p++ = 'U';
5605 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
5606 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
5607 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
5608 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
5609 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
5610 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
5611 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
5612 *p++ = hexdigits[ucs & 0x0000000F];
5613 continue;
5614 }
5615 /* Fall through: isolated surrogates are copied as-is */
5616 s--;
5617 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005618 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005619#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005620
Guido van Rossumd57fd912000-03-10 22:53:23 +00005621 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005622 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005623 *p++ = '\\';
5624 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005625 *p++ = hexdigits[(ch >> 12) & 0x000F];
5626 *p++ = hexdigits[(ch >> 8) & 0x000F];
5627 *p++ = hexdigits[(ch >> 4) & 0x000F];
5628 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005629 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005630
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005631 /* Map special whitespace to '\t', \n', '\r' */
5632 else if (ch == '\t') {
5633 *p++ = '\\';
5634 *p++ = 't';
5635 }
5636 else if (ch == '\n') {
5637 *p++ = '\\';
5638 *p++ = 'n';
5639 }
5640 else if (ch == '\r') {
5641 *p++ = '\\';
5642 *p++ = 'r';
5643 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005644
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005645 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005646 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005647 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005648 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005649 *p++ = hexdigits[(ch >> 4) & 0x000F];
5650 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005651 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005652
Guido van Rossumd57fd912000-03-10 22:53:23 +00005653 /* Copy everything else as-is */
5654 else
5655 *p++ = (char) ch;
5656 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005657
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005658 assert(p - PyBytes_AS_STRING(repr) > 0);
5659 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5660 return NULL;
5661 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005662}
5663
Alexander Belopolsky40018472011-02-26 01:02:56 +00005664PyObject *
5665PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005666{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005667 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005668 if (!PyUnicode_Check(unicode)) {
5669 PyErr_BadArgument();
5670 return NULL;
5671 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00005672 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5673 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005674 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005675}
5676
5677/* --- Raw Unicode Escape Codec ------------------------------------------- */
5678
Alexander Belopolsky40018472011-02-26 01:02:56 +00005679PyObject *
5680PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005681 Py_ssize_t size,
5682 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005683{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005684 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005685 Py_ssize_t startinpos;
5686 Py_ssize_t endinpos;
5687 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005688 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005689 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005690 const char *end;
5691 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005692 PyObject *errorHandler = NULL;
5693 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005694
Guido van Rossumd57fd912000-03-10 22:53:23 +00005695 /* Escaped strings will always be longer than the resulting
5696 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005697 length after conversion to the true value. (But decoding error
5698 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005699 v = _PyUnicode_New(size);
5700 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005701 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005702 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005703 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005704 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005705 end = s + size;
5706 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005707 unsigned char c;
5708 Py_UCS4 x;
5709 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005710 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005711
Benjamin Peterson29060642009-01-31 22:14:21 +00005712 /* Non-escape characters are interpreted as Unicode ordinals */
5713 if (*s != '\\') {
5714 *p++ = (unsigned char)*s++;
5715 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005716 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005717 startinpos = s-starts;
5718
5719 /* \u-escapes are only interpreted iff the number of leading
5720 backslashes if odd */
5721 bs = s;
5722 for (;s < end;) {
5723 if (*s != '\\')
5724 break;
5725 *p++ = (unsigned char)*s++;
5726 }
5727 if (((s - bs) & 1) == 0 ||
5728 s >= end ||
5729 (*s != 'u' && *s != 'U')) {
5730 continue;
5731 }
5732 p--;
5733 count = *s=='u' ? 4 : 8;
5734 s++;
5735
5736 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
5737 outpos = p-PyUnicode_AS_UNICODE(v);
5738 for (x = 0, i = 0; i < count; ++i, ++s) {
5739 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005740 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005741 endinpos = s-starts;
5742 if (unicode_decode_call_errorhandler(
5743 errors, &errorHandler,
5744 "rawunicodeescape", "truncated \\uXXXX",
5745 &starts, &end, &startinpos, &endinpos, &exc, &s,
5746 &v, &outpos, &p))
5747 goto onError;
5748 goto nextByte;
5749 }
5750 x = (x<<4) & ~0xF;
5751 if (c >= '0' && c <= '9')
5752 x += c - '0';
5753 else if (c >= 'a' && c <= 'f')
5754 x += 10 + c - 'a';
5755 else
5756 x += 10 + c - 'A';
5757 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00005758 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00005759 /* UCS-2 character */
5760 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005761 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005762 /* UCS-4 character. Either store directly, or as
5763 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00005764#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005765 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005766#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005767 x -= 0x10000L;
5768 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
5769 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00005770#endif
5771 } else {
5772 endinpos = s-starts;
5773 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005774 if (unicode_decode_call_errorhandler(
5775 errors, &errorHandler,
5776 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005777 &starts, &end, &startinpos, &endinpos, &exc, &s,
5778 &v, &outpos, &p))
5779 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005780 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005781 nextByte:
5782 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005783 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02005784 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005785 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005786 Py_XDECREF(errorHandler);
5787 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005788 if (PyUnicode_READY(v) == -1) {
5789 Py_DECREF(v);
5790 return NULL;
5791 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005792 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005793
Benjamin Peterson29060642009-01-31 22:14:21 +00005794 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005795 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005796 Py_XDECREF(errorHandler);
5797 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005798 return NULL;
5799}
5800
Alexander Belopolsky40018472011-02-26 01:02:56 +00005801PyObject *
5802PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005803 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005804{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005805 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005806 char *p;
5807 char *q;
5808
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005809#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005810 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005811#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005812 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005813#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00005814
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005815 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005816 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005817
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005818 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005819 if (repr == NULL)
5820 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005821 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005822 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005823
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005824 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005825 while (size-- > 0) {
5826 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005827#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005828 /* Map 32-bit characters to '\Uxxxxxxxx' */
5829 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005830 *p++ = '\\';
5831 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005832 *p++ = hexdigits[(ch >> 28) & 0xf];
5833 *p++ = hexdigits[(ch >> 24) & 0xf];
5834 *p++ = hexdigits[(ch >> 20) & 0xf];
5835 *p++ = hexdigits[(ch >> 16) & 0xf];
5836 *p++ = hexdigits[(ch >> 12) & 0xf];
5837 *p++ = hexdigits[(ch >> 8) & 0xf];
5838 *p++ = hexdigits[(ch >> 4) & 0xf];
5839 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00005840 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005841 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00005842#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005843 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5844 if (ch >= 0xD800 && ch < 0xDC00) {
5845 Py_UNICODE ch2;
5846 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005847
Benjamin Peterson29060642009-01-31 22:14:21 +00005848 ch2 = *s++;
5849 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005850 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005851 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5852 *p++ = '\\';
5853 *p++ = 'U';
5854 *p++ = hexdigits[(ucs >> 28) & 0xf];
5855 *p++ = hexdigits[(ucs >> 24) & 0xf];
5856 *p++ = hexdigits[(ucs >> 20) & 0xf];
5857 *p++ = hexdigits[(ucs >> 16) & 0xf];
5858 *p++ = hexdigits[(ucs >> 12) & 0xf];
5859 *p++ = hexdigits[(ucs >> 8) & 0xf];
5860 *p++ = hexdigits[(ucs >> 4) & 0xf];
5861 *p++ = hexdigits[ucs & 0xf];
5862 continue;
5863 }
5864 /* Fall through: isolated surrogates are copied as-is */
5865 s--;
5866 size++;
5867 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005868#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005869 /* Map 16-bit characters to '\uxxxx' */
5870 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005871 *p++ = '\\';
5872 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005873 *p++ = hexdigits[(ch >> 12) & 0xf];
5874 *p++ = hexdigits[(ch >> 8) & 0xf];
5875 *p++ = hexdigits[(ch >> 4) & 0xf];
5876 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005877 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005878 /* Copy everything else as-is */
5879 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00005880 *p++ = (char) ch;
5881 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005882 size = p - q;
5883
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005884 assert(size > 0);
5885 if (_PyBytes_Resize(&repr, size) < 0)
5886 return NULL;
5887 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005888}
5889
Alexander Belopolsky40018472011-02-26 01:02:56 +00005890PyObject *
5891PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005892{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005893 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005894 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00005895 PyErr_BadArgument();
5896 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005897 }
Walter Dörwald711005d2007-05-12 12:03:26 +00005898 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5899 PyUnicode_GET_SIZE(unicode));
5900
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005901 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005902}
5903
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005904/* --- Unicode Internal Codec ------------------------------------------- */
5905
Alexander Belopolsky40018472011-02-26 01:02:56 +00005906PyObject *
5907_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005908 Py_ssize_t size,
5909 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005910{
5911 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005912 Py_ssize_t startinpos;
5913 Py_ssize_t endinpos;
5914 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005915 PyUnicodeObject *v;
5916 Py_UNICODE *p;
5917 const char *end;
5918 const char *reason;
5919 PyObject *errorHandler = NULL;
5920 PyObject *exc = NULL;
5921
Neal Norwitzd43069c2006-01-08 01:12:10 +00005922#ifdef Py_UNICODE_WIDE
5923 Py_UNICODE unimax = PyUnicode_GetMax();
5924#endif
5925
Thomas Wouters89f507f2006-12-13 04:49:30 +00005926 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005927 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
5928 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005929 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005930 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
5931 as string was created with the old API. */
5932 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005933 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005934 p = PyUnicode_AS_UNICODE(v);
5935 end = s + size;
5936
5937 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005938 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005939 /* We have to sanity check the raw data, otherwise doom looms for
5940 some malformed UCS-4 data. */
5941 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00005942#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005943 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00005944#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005945 end-s < Py_UNICODE_SIZE
5946 )
Benjamin Peterson29060642009-01-31 22:14:21 +00005947 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005948 startinpos = s - starts;
5949 if (end-s < Py_UNICODE_SIZE) {
5950 endinpos = end-starts;
5951 reason = "truncated input";
5952 }
5953 else {
5954 endinpos = s - starts + Py_UNICODE_SIZE;
5955 reason = "illegal code point (> 0x10FFFF)";
5956 }
5957 outpos = p - PyUnicode_AS_UNICODE(v);
5958 if (unicode_decode_call_errorhandler(
5959 errors, &errorHandler,
5960 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00005961 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00005962 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005963 goto onError;
5964 }
5965 }
5966 else {
5967 p++;
5968 s += Py_UNICODE_SIZE;
5969 }
5970 }
5971
Victor Stinnerfe226c02011-10-03 03:52:20 +02005972 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005973 goto onError;
5974 Py_XDECREF(errorHandler);
5975 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005976 if (PyUnicode_READY(v) == -1) {
5977 Py_DECREF(v);
5978 return NULL;
5979 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005980 return (PyObject *)v;
5981
Benjamin Peterson29060642009-01-31 22:14:21 +00005982 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005983 Py_XDECREF(v);
5984 Py_XDECREF(errorHandler);
5985 Py_XDECREF(exc);
5986 return NULL;
5987}
5988
Guido van Rossumd57fd912000-03-10 22:53:23 +00005989/* --- Latin-1 Codec ------------------------------------------------------ */
5990
Alexander Belopolsky40018472011-02-26 01:02:56 +00005991PyObject *
5992PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005993 Py_ssize_t size,
5994 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005995{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005996 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02005997 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005998}
5999
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006000/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006001static void
6002make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006003 const char *encoding,
6004 const Py_UNICODE *unicode, Py_ssize_t size,
6005 Py_ssize_t startpos, Py_ssize_t endpos,
6006 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006007{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006008 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006009 *exceptionObject = PyUnicodeEncodeError_Create(
6010 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006011 }
6012 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006013 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6014 goto onError;
6015 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6016 goto onError;
6017 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6018 goto onError;
6019 return;
6020 onError:
6021 Py_DECREF(*exceptionObject);
6022 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006023 }
6024}
6025
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006026/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006027static void
6028raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006029 const char *encoding,
6030 const Py_UNICODE *unicode, Py_ssize_t size,
6031 Py_ssize_t startpos, Py_ssize_t endpos,
6032 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006033{
6034 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006035 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006036 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006037 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006038}
6039
6040/* error handling callback helper:
6041 build arguments, call the callback and check the arguments,
6042 put the result into newpos and return the replacement string, which
6043 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006044static PyObject *
6045unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006046 PyObject **errorHandler,
6047 const char *encoding, const char *reason,
6048 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
6049 Py_ssize_t startpos, Py_ssize_t endpos,
6050 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006051{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006052 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006053
6054 PyObject *restuple;
6055 PyObject *resunicode;
6056
6057 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006058 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006059 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006060 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006061 }
6062
6063 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006064 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006065 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006066 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006067
6068 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006069 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006070 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006071 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006072 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006073 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006074 Py_DECREF(restuple);
6075 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006076 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006077 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006078 &resunicode, newpos)) {
6079 Py_DECREF(restuple);
6080 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006081 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006082 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6083 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6084 Py_DECREF(restuple);
6085 return NULL;
6086 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006087 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006088 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006089 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006090 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6091 Py_DECREF(restuple);
6092 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006093 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006094 Py_INCREF(resunicode);
6095 Py_DECREF(restuple);
6096 return resunicode;
6097}
6098
Alexander Belopolsky40018472011-02-26 01:02:56 +00006099static PyObject *
6100unicode_encode_ucs1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006101 Py_ssize_t size,
6102 const char *errors,
6103 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006104{
6105 /* output object */
6106 PyObject *res;
6107 /* pointers to the beginning and end+1 of input */
6108 const Py_UNICODE *startp = p;
6109 const Py_UNICODE *endp = p + size;
6110 /* pointer to the beginning of the unencodable characters */
6111 /* const Py_UNICODE *badp = NULL; */
6112 /* pointer into the output */
6113 char *str;
6114 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006115 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006116 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6117 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006118 PyObject *errorHandler = NULL;
6119 PyObject *exc = NULL;
6120 /* the following variable is used for caching string comparisons
6121 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6122 int known_errorHandler = -1;
6123
6124 /* allocate enough for a simple encoding without
6125 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006126 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006127 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006128 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006129 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006130 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006131 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006132 ressize = size;
6133
6134 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006135 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006136
Benjamin Peterson29060642009-01-31 22:14:21 +00006137 /* can we encode this? */
6138 if (c<limit) {
6139 /* no overflow check, because we know that the space is enough */
6140 *str++ = (char)c;
6141 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006142 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006143 else {
6144 Py_ssize_t unicodepos = p-startp;
6145 Py_ssize_t requiredsize;
6146 PyObject *repunicode;
6147 Py_ssize_t repsize;
6148 Py_ssize_t newpos;
6149 Py_ssize_t respos;
6150 Py_UNICODE *uni2;
6151 /* startpos for collecting unencodable chars */
6152 const Py_UNICODE *collstart = p;
6153 const Py_UNICODE *collend = p;
6154 /* find all unecodable characters */
6155 while ((collend < endp) && ((*collend)>=limit))
6156 ++collend;
6157 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6158 if (known_errorHandler==-1) {
6159 if ((errors==NULL) || (!strcmp(errors, "strict")))
6160 known_errorHandler = 1;
6161 else if (!strcmp(errors, "replace"))
6162 known_errorHandler = 2;
6163 else if (!strcmp(errors, "ignore"))
6164 known_errorHandler = 3;
6165 else if (!strcmp(errors, "xmlcharrefreplace"))
6166 known_errorHandler = 4;
6167 else
6168 known_errorHandler = 0;
6169 }
6170 switch (known_errorHandler) {
6171 case 1: /* strict */
6172 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
6173 goto onError;
6174 case 2: /* replace */
6175 while (collstart++<collend)
6176 *str++ = '?'; /* fall through */
6177 case 3: /* ignore */
6178 p = collend;
6179 break;
6180 case 4: /* xmlcharrefreplace */
6181 respos = str - PyBytes_AS_STRING(res);
6182 /* determine replacement size (temporarily (mis)uses p) */
6183 for (p = collstart, repsize = 0; p < collend; ++p) {
6184 if (*p<10)
6185 repsize += 2+1+1;
6186 else if (*p<100)
6187 repsize += 2+2+1;
6188 else if (*p<1000)
6189 repsize += 2+3+1;
6190 else if (*p<10000)
6191 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006192#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006193 else
6194 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006195#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006196 else if (*p<100000)
6197 repsize += 2+5+1;
6198 else if (*p<1000000)
6199 repsize += 2+6+1;
6200 else
6201 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00006202#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006203 }
6204 requiredsize = respos+repsize+(endp-collend);
6205 if (requiredsize > ressize) {
6206 if (requiredsize<2*ressize)
6207 requiredsize = 2*ressize;
6208 if (_PyBytes_Resize(&res, requiredsize))
6209 goto onError;
6210 str = PyBytes_AS_STRING(res) + respos;
6211 ressize = requiredsize;
6212 }
6213 /* generate replacement (temporarily (mis)uses p) */
6214 for (p = collstart; p < collend; ++p) {
6215 str += sprintf(str, "&#%d;", (int)*p);
6216 }
6217 p = collend;
6218 break;
6219 default:
6220 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6221 encoding, reason, startp, size, &exc,
6222 collstart-startp, collend-startp, &newpos);
6223 if (repunicode == NULL)
6224 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006225 if (PyBytes_Check(repunicode)) {
6226 /* Directly copy bytes result to output. */
6227 repsize = PyBytes_Size(repunicode);
6228 if (repsize > 1) {
6229 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006230 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006231 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6232 Py_DECREF(repunicode);
6233 goto onError;
6234 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006235 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006236 ressize += repsize-1;
6237 }
6238 memcpy(str, PyBytes_AsString(repunicode), repsize);
6239 str += repsize;
6240 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006241 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006242 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006243 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006244 /* need more space? (at least enough for what we
6245 have+the replacement+the rest of the string, so
6246 we won't have to check space for encodable characters) */
6247 respos = str - PyBytes_AS_STRING(res);
6248 repsize = PyUnicode_GET_SIZE(repunicode);
6249 requiredsize = respos+repsize+(endp-collend);
6250 if (requiredsize > ressize) {
6251 if (requiredsize<2*ressize)
6252 requiredsize = 2*ressize;
6253 if (_PyBytes_Resize(&res, requiredsize)) {
6254 Py_DECREF(repunicode);
6255 goto onError;
6256 }
6257 str = PyBytes_AS_STRING(res) + respos;
6258 ressize = requiredsize;
6259 }
6260 /* check if there is anything unencodable in the replacement
6261 and copy it to the output */
6262 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
6263 c = *uni2;
6264 if (c >= limit) {
6265 raise_encode_exception(&exc, encoding, startp, size,
6266 unicodepos, unicodepos+1, reason);
6267 Py_DECREF(repunicode);
6268 goto onError;
6269 }
6270 *str = (char)c;
6271 }
6272 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006273 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006274 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006275 }
6276 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006277 /* Resize if we allocated to much */
6278 size = str - PyBytes_AS_STRING(res);
6279 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006280 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006281 if (_PyBytes_Resize(&res, size) < 0)
6282 goto onError;
6283 }
6284
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006285 Py_XDECREF(errorHandler);
6286 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006287 return res;
6288
6289 onError:
6290 Py_XDECREF(res);
6291 Py_XDECREF(errorHandler);
6292 Py_XDECREF(exc);
6293 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006294}
6295
Alexander Belopolsky40018472011-02-26 01:02:56 +00006296PyObject *
6297PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006298 Py_ssize_t size,
6299 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006300{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006301 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006302}
6303
Alexander Belopolsky40018472011-02-26 01:02:56 +00006304PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006305_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006306{
6307 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006308 PyErr_BadArgument();
6309 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006310 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006311 if (PyUnicode_READY(unicode) == -1)
6312 return NULL;
6313 /* Fast path: if it is a one-byte string, construct
6314 bytes object directly. */
6315 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6316 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6317 PyUnicode_GET_LENGTH(unicode));
6318 /* Non-Latin-1 characters present. Defer to above function to
6319 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006320 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006321 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006322 errors);
6323}
6324
6325PyObject*
6326PyUnicode_AsLatin1String(PyObject *unicode)
6327{
6328 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006329}
6330
6331/* --- 7-bit ASCII Codec -------------------------------------------------- */
6332
Alexander Belopolsky40018472011-02-26 01:02:56 +00006333PyObject *
6334PyUnicode_DecodeASCII(const char *s,
6335 Py_ssize_t size,
6336 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006337{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006338 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006339 PyUnicodeObject *v;
6340 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006341 Py_ssize_t startinpos;
6342 Py_ssize_t endinpos;
6343 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006344 const char *e;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006345 unsigned char* d;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006346 PyObject *errorHandler = NULL;
6347 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006348 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00006349
Guido van Rossumd57fd912000-03-10 22:53:23 +00006350 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006351 if (size == 1 && *(unsigned char*)s < 128)
6352 return PyUnicode_FromOrdinal(*(unsigned char*)s);
6353
6354 /* Fast path. Assume the input actually *is* ASCII, and allocate
6355 a single-block Unicode object with that assumption. If there is
6356 an error, drop the object and start over. */
6357 v = (PyUnicodeObject*)PyUnicode_New(size, 127);
6358 if (v == NULL)
6359 goto onError;
6360 d = PyUnicode_1BYTE_DATA(v);
6361 for (i = 0; i < size; i++) {
6362 unsigned char ch = ((unsigned char*)s)[i];
6363 if (ch < 128)
6364 d[i] = ch;
6365 else
6366 break;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006367 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006368 if (i == size)
6369 return (PyObject*)v;
6370 Py_DECREF(v); /* start over */
Tim Petersced69f82003-09-16 20:30:58 +00006371
Guido van Rossumd57fd912000-03-10 22:53:23 +00006372 v = _PyUnicode_New(size);
6373 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006374 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006375 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006376 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006377 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006378 e = s + size;
6379 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006380 register unsigned char c = (unsigned char)*s;
6381 if (c < 128) {
6382 *p++ = c;
6383 ++s;
6384 }
6385 else {
6386 startinpos = s-starts;
6387 endinpos = startinpos + 1;
6388 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
6389 if (unicode_decode_call_errorhandler(
6390 errors, &errorHandler,
6391 "ascii", "ordinal not in range(128)",
6392 &starts, &e, &startinpos, &endinpos, &exc, &s,
6393 &v, &outpos, &p))
6394 goto onError;
6395 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006396 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00006397 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinnerfe226c02011-10-03 03:52:20 +02006398 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006399 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006400 Py_XDECREF(errorHandler);
6401 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006402 if (PyUnicode_READY(v) == -1) {
6403 Py_DECREF(v);
6404 return NULL;
6405 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006406 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006407
Benjamin Peterson29060642009-01-31 22:14:21 +00006408 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006409 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006410 Py_XDECREF(errorHandler);
6411 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006412 return NULL;
6413}
6414
Alexander Belopolsky40018472011-02-26 01:02:56 +00006415PyObject *
6416PyUnicode_EncodeASCII(const Py_UNICODE *p,
6417 Py_ssize_t size,
6418 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006419{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006420 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006421}
6422
Alexander Belopolsky40018472011-02-26 01:02:56 +00006423PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006424_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006425{
6426 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006427 PyErr_BadArgument();
6428 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006429 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006430 if (PyUnicode_READY(unicode) == -1)
6431 return NULL;
6432 /* Fast path: if it is an ASCII-only string, construct bytes object
6433 directly. Else defer to above function to raise the exception. */
6434 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6435 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6436 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006437 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006438 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006439 errors);
6440}
6441
6442PyObject *
6443PyUnicode_AsASCIIString(PyObject *unicode)
6444{
6445 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006446}
6447
Victor Stinner99b95382011-07-04 14:23:54 +02006448#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006449
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006450/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006451
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006452#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006453#define NEED_RETRY
6454#endif
6455
6456/* XXX This code is limited to "true" double-byte encodings, as
6457 a) it assumes an incomplete character consists of a single byte, and
6458 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00006459 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006460
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead byte,
   0 otherwise.

   The byte must satisfy IsDBCSLeadByte().  It is then accepted when
   CharPrev() finds no earlier character (it returns `curr` itself), when
   the previous character does not start with a lead byte, or when the
   previous character is exactly two bytes wide. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
6472
6473/*
6474 * Decode MBCS string into unicode object. If 'final' is set, converts
6475 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6476 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006477static int
6478decode_mbcs(PyUnicodeObject **v,
6479 const char *s, /* MBCS string */
6480 int size, /* sizeof MBCS string */
6481 int final,
6482 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006483{
6484 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00006485 Py_ssize_t n;
6486 DWORD usize;
6487 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006488
6489 assert(size >= 0);
6490
Victor Stinner554f3f02010-06-16 23:33:54 +00006491 /* check and handle 'errors' arg */
6492 if (errors==NULL || strcmp(errors, "strict")==0)
6493 flags = MB_ERR_INVALID_CHARS;
6494 else if (strcmp(errors, "ignore")==0)
6495 flags = 0;
6496 else {
6497 PyErr_Format(PyExc_ValueError,
6498 "mbcs encoding does not support errors='%s'",
6499 errors);
6500 return -1;
6501 }
6502
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006503 /* Skip trailing lead-byte unless 'final' is set */
6504 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006505 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006506
6507 /* First get the size of the result */
6508 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006509 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6510 if (usize==0)
6511 goto mbcs_decode_error;
6512 } else
6513 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006514
6515 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006516 /* Create unicode object */
6517 *v = _PyUnicode_New(usize);
6518 if (*v == NULL)
6519 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006520 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006521 }
6522 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006523 /* Extend unicode object */
6524 n = PyUnicode_GET_SIZE(*v);
Victor Stinner2fd82272011-10-03 04:06:05 +02006525 if (PyUnicode_Resize((PyObject**)v, n + usize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006526 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006527 }
6528
6529 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00006530 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006531 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006532 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6533 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00006534 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006535 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006536 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00006537
6538mbcs_decode_error:
6539 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6540 we raise a UnicodeDecodeError - else it is a 'generic'
6541 windows error
6542 */
6543 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6544 /* Ideally, we should get reason from FormatMessage - this
6545 is the Windows 2000 English version of the message
6546 */
6547 PyObject *exc = NULL;
6548 const char *reason = "No mapping for the Unicode character exists "
6549 "in the target multi-byte code page.";
6550 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6551 if (exc != NULL) {
6552 PyCodec_StrictErrors(exc);
6553 Py_DECREF(exc);
6554 }
6555 } else {
6556 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6557 }
6558 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006559}
6560
Alexander Belopolsky40018472011-02-26 01:02:56 +00006561PyObject *
6562PyUnicode_DecodeMBCSStateful(const char *s,
6563 Py_ssize_t size,
6564 const char *errors,
6565 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006566{
6567 PyUnicodeObject *v = NULL;
6568 int done;
6569
6570 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006571 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006572
6573#ifdef NEED_RETRY
6574 retry:
6575 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006576 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006577 else
6578#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006579 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006580
6581 if (done < 0) {
6582 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006583 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006584 }
6585
6586 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006587 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006588
6589#ifdef NEED_RETRY
6590 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006591 s += done;
6592 size -= done;
6593 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006594 }
6595#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006596 if (PyUnicode_READY(v) == -1) {
6597 Py_DECREF(v);
6598 return NULL;
6599 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006600 return (PyObject *)v;
6601}
6602
Alexander Belopolsky40018472011-02-26 01:02:56 +00006603PyObject *
6604PyUnicode_DecodeMBCS(const char *s,
6605 Py_ssize_t size,
6606 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006607{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006608 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6609}
6610
6611/*
6612 * Convert unicode into string object (MBCS).
6613 * Returns 0 if succeed, -1 otherwise.
6614 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006615static int
6616encode_mbcs(PyObject **repr,
6617 const Py_UNICODE *p, /* unicode */
6618 int size, /* size of unicode */
6619 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006620{
Victor Stinner554f3f02010-06-16 23:33:54 +00006621 BOOL usedDefaultChar = FALSE;
6622 BOOL *pusedDefaultChar;
6623 int mbcssize;
6624 Py_ssize_t n;
6625 PyObject *exc = NULL;
6626 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006627
6628 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006629
Victor Stinner554f3f02010-06-16 23:33:54 +00006630 /* check and handle 'errors' arg */
6631 if (errors==NULL || strcmp(errors, "strict")==0) {
6632 flags = WC_NO_BEST_FIT_CHARS;
6633 pusedDefaultChar = &usedDefaultChar;
6634 } else if (strcmp(errors, "replace")==0) {
6635 flags = 0;
6636 pusedDefaultChar = NULL;
6637 } else {
6638 PyErr_Format(PyExc_ValueError,
6639 "mbcs encoding does not support errors='%s'",
6640 errors);
6641 return -1;
6642 }
6643
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006644 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006645 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006646 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
6647 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00006648 if (mbcssize == 0) {
6649 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6650 return -1;
6651 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006652 /* If we used a default char, then we failed! */
6653 if (pusedDefaultChar && *pusedDefaultChar)
6654 goto mbcs_encode_error;
6655 } else {
6656 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006657 }
6658
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006659 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006660 /* Create string object */
6661 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
6662 if (*repr == NULL)
6663 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006664 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006665 }
6666 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006667 /* Extend string object */
6668 n = PyBytes_Size(*repr);
6669 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
6670 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006671 }
6672
6673 /* Do the conversion */
6674 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006675 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006676 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
6677 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006678 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6679 return -1;
6680 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006681 if (pusedDefaultChar && *pusedDefaultChar)
6682 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006683 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006684 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006685
6686mbcs_encode_error:
6687 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
6688 Py_XDECREF(exc);
6689 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006690}
6691
Alexander Belopolsky40018472011-02-26 01:02:56 +00006692PyObject *
6693PyUnicode_EncodeMBCS(const Py_UNICODE *p,
6694 Py_ssize_t size,
6695 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006696{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006697 PyObject *repr = NULL;
6698 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00006699
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006700#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00006701 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006702 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006703 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006704 else
6705#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006706 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006707
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006708 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006709 Py_XDECREF(repr);
6710 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006711 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006712
6713#ifdef NEED_RETRY
6714 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006715 p += INT_MAX;
6716 size -= INT_MAX;
6717 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006718 }
6719#endif
6720
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006721 return repr;
6722}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006723
Alexander Belopolsky40018472011-02-26 01:02:56 +00006724PyObject *
6725PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006726{
6727 if (!PyUnicode_Check(unicode)) {
6728 PyErr_BadArgument();
6729 return NULL;
6730 }
6731 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006732 PyUnicode_GET_SIZE(unicode),
6733 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006734}
6735
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006736#undef NEED_RETRY
6737
Victor Stinner99b95382011-07-04 14:23:54 +02006738#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006739
Guido van Rossumd57fd912000-03-10 22:53:23 +00006740/* --- Character Mapping Codec -------------------------------------------- */
6741
Alexander Belopolsky40018472011-02-26 01:02:56 +00006742PyObject *
6743PyUnicode_DecodeCharmap(const char *s,
6744 Py_ssize_t size,
6745 PyObject *mapping,
6746 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006747{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006748 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006749 Py_ssize_t startinpos;
6750 Py_ssize_t endinpos;
6751 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006752 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006753 PyUnicodeObject *v;
6754 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006755 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006756 PyObject *errorHandler = NULL;
6757 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006758 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006759 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006760
Guido van Rossumd57fd912000-03-10 22:53:23 +00006761 /* Default to Latin-1 */
6762 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006763 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006764
6765 v = _PyUnicode_New(size);
6766 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006767 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006768 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006769 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006770 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006771 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006772 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006773 mapstring = PyUnicode_AS_UNICODE(mapping);
6774 maplen = PyUnicode_GET_SIZE(mapping);
6775 while (s < e) {
6776 unsigned char ch = *s;
6777 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006778
Benjamin Peterson29060642009-01-31 22:14:21 +00006779 if (ch < maplen)
6780 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006781
Benjamin Peterson29060642009-01-31 22:14:21 +00006782 if (x == 0xfffe) {
6783 /* undefined mapping */
6784 outpos = p-PyUnicode_AS_UNICODE(v);
6785 startinpos = s-starts;
6786 endinpos = startinpos+1;
6787 if (unicode_decode_call_errorhandler(
6788 errors, &errorHandler,
6789 "charmap", "character maps to <undefined>",
6790 &starts, &e, &startinpos, &endinpos, &exc, &s,
6791 &v, &outpos, &p)) {
6792 goto onError;
6793 }
6794 continue;
6795 }
6796 *p++ = x;
6797 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006798 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006799 }
6800 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006801 while (s < e) {
6802 unsigned char ch = *s;
6803 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006804
Benjamin Peterson29060642009-01-31 22:14:21 +00006805 /* Get mapping (char ordinal -> integer, Unicode char or None) */
6806 w = PyLong_FromLong((long)ch);
6807 if (w == NULL)
6808 goto onError;
6809 x = PyObject_GetItem(mapping, w);
6810 Py_DECREF(w);
6811 if (x == NULL) {
6812 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6813 /* No mapping found means: mapping is undefined. */
6814 PyErr_Clear();
6815 x = Py_None;
6816 Py_INCREF(x);
6817 } else
6818 goto onError;
6819 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006820
Benjamin Peterson29060642009-01-31 22:14:21 +00006821 /* Apply mapping */
6822 if (PyLong_Check(x)) {
6823 long value = PyLong_AS_LONG(x);
6824 if (value < 0 || value > 65535) {
6825 PyErr_SetString(PyExc_TypeError,
6826 "character mapping must be in range(65536)");
6827 Py_DECREF(x);
6828 goto onError;
6829 }
6830 *p++ = (Py_UNICODE)value;
6831 }
6832 else if (x == Py_None) {
6833 /* undefined mapping */
6834 outpos = p-PyUnicode_AS_UNICODE(v);
6835 startinpos = s-starts;
6836 endinpos = startinpos+1;
6837 if (unicode_decode_call_errorhandler(
6838 errors, &errorHandler,
6839 "charmap", "character maps to <undefined>",
6840 &starts, &e, &startinpos, &endinpos, &exc, &s,
6841 &v, &outpos, &p)) {
6842 Py_DECREF(x);
6843 goto onError;
6844 }
6845 Py_DECREF(x);
6846 continue;
6847 }
6848 else if (PyUnicode_Check(x)) {
6849 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006850
Benjamin Peterson29060642009-01-31 22:14:21 +00006851 if (targetsize == 1)
6852 /* 1-1 mapping */
6853 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006854
Benjamin Peterson29060642009-01-31 22:14:21 +00006855 else if (targetsize > 1) {
6856 /* 1-n mapping */
6857 if (targetsize > extrachars) {
6858 /* resize first */
6859 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
6860 Py_ssize_t needed = (targetsize - extrachars) + \
6861 (targetsize << 2);
6862 extrachars += needed;
6863 /* XXX overflow detection missing */
Victor Stinnerfe226c02011-10-03 03:52:20 +02006864 if (PyUnicode_Resize((PyObject**)&v,
Benjamin Peterson29060642009-01-31 22:14:21 +00006865 PyUnicode_GET_SIZE(v) + needed) < 0) {
6866 Py_DECREF(x);
6867 goto onError;
6868 }
6869 p = PyUnicode_AS_UNICODE(v) + oldpos;
6870 }
6871 Py_UNICODE_COPY(p,
6872 PyUnicode_AS_UNICODE(x),
6873 targetsize);
6874 p += targetsize;
6875 extrachars -= targetsize;
6876 }
6877 /* 1-0 mapping: skip the character */
6878 }
6879 else {
6880 /* wrong return value */
6881 PyErr_SetString(PyExc_TypeError,
6882 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006883 Py_DECREF(x);
6884 goto onError;
6885 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006886 Py_DECREF(x);
6887 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006888 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006889 }
6890 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinnerfe226c02011-10-03 03:52:20 +02006891 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006892 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006893 Py_XDECREF(errorHandler);
6894 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006895 if (PyUnicode_READY(v) == -1) {
6896 Py_DECREF(v);
6897 return NULL;
6898 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006899 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006900
Benjamin Peterson29060642009-01-31 22:14:21 +00006901 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006902 Py_XDECREF(errorHandler);
6903 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006904 Py_XDECREF(v);
6905 return NULL;
6906}
6907
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006908/* Charmap encoding: the lookup table */
6909
Alexander Belopolsky40018472011-02-26 01:02:56 +00006910struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00006911 PyObject_HEAD
6912 unsigned char level1[32];
6913 int count2, count3;
6914 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006915};
6916
6917static PyObject*
6918encoding_map_size(PyObject *obj, PyObject* args)
6919{
6920 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006921 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00006922 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006923}
6924
6925static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006926 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00006927 PyDoc_STR("Return the size (in bytes) of this object") },
6928 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006929};
6930
6931static void
6932encoding_map_dealloc(PyObject* o)
6933{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006934 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006935}
6936
6937static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006938 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006939 "EncodingMap", /*tp_name*/
6940 sizeof(struct encoding_map), /*tp_basicsize*/
6941 0, /*tp_itemsize*/
6942 /* methods */
6943 encoding_map_dealloc, /*tp_dealloc*/
6944 0, /*tp_print*/
6945 0, /*tp_getattr*/
6946 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00006947 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00006948 0, /*tp_repr*/
6949 0, /*tp_as_number*/
6950 0, /*tp_as_sequence*/
6951 0, /*tp_as_mapping*/
6952 0, /*tp_hash*/
6953 0, /*tp_call*/
6954 0, /*tp_str*/
6955 0, /*tp_getattro*/
6956 0, /*tp_setattro*/
6957 0, /*tp_as_buffer*/
6958 Py_TPFLAGS_DEFAULT, /*tp_flags*/
6959 0, /*tp_doc*/
6960 0, /*tp_traverse*/
6961 0, /*tp_clear*/
6962 0, /*tp_richcompare*/
6963 0, /*tp_weaklistoffset*/
6964 0, /*tp_iter*/
6965 0, /*tp_iternext*/
6966 encoding_map_methods, /*tp_methods*/
6967 0, /*tp_members*/
6968 0, /*tp_getset*/
6969 0, /*tp_base*/
6970 0, /*tp_dict*/
6971 0, /*tp_descr_get*/
6972 0, /*tp_descr_set*/
6973 0, /*tp_dictoffset*/
6974 0, /*tp_init*/
6975 0, /*tp_alloc*/
6976 0, /*tp_new*/
6977 0, /*tp_free*/
6978 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006979};
6980
6981PyObject*
6982PyUnicode_BuildEncodingMap(PyObject* string)
6983{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006984 PyObject *result;
6985 struct encoding_map *mresult;
6986 int i;
6987 int need_dict = 0;
6988 unsigned char level1[32];
6989 unsigned char level2[512];
6990 unsigned char *mlevel1, *mlevel2, *mlevel3;
6991 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006992 int kind;
6993 void *data;
6994 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006995
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006996 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006997 PyErr_BadArgument();
6998 return NULL;
6999 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007000 kind = PyUnicode_KIND(string);
7001 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007002 memset(level1, 0xFF, sizeof level1);
7003 memset(level2, 0xFF, sizeof level2);
7004
7005 /* If there isn't a one-to-one mapping of NULL to \0,
7006 or if there are non-BMP characters, we need to use
7007 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007008 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007009 need_dict = 1;
7010 for (i = 1; i < 256; i++) {
7011 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007012 ch = PyUnicode_READ(kind, data, i);
7013 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007014 need_dict = 1;
7015 break;
7016 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007017 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007018 /* unmapped character */
7019 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007020 l1 = ch >> 11;
7021 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007022 if (level1[l1] == 0xFF)
7023 level1[l1] = count2++;
7024 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007025 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007026 }
7027
7028 if (count2 >= 0xFF || count3 >= 0xFF)
7029 need_dict = 1;
7030
7031 if (need_dict) {
7032 PyObject *result = PyDict_New();
7033 PyObject *key, *value;
7034 if (!result)
7035 return NULL;
7036 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007037 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007038 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007039 if (!key || !value)
7040 goto failed1;
7041 if (PyDict_SetItem(result, key, value) == -1)
7042 goto failed1;
7043 Py_DECREF(key);
7044 Py_DECREF(value);
7045 }
7046 return result;
7047 failed1:
7048 Py_XDECREF(key);
7049 Py_XDECREF(value);
7050 Py_DECREF(result);
7051 return NULL;
7052 }
7053
7054 /* Create a three-level trie */
7055 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7056 16*count2 + 128*count3 - 1);
7057 if (!result)
7058 return PyErr_NoMemory();
7059 PyObject_Init(result, &EncodingMapType);
7060 mresult = (struct encoding_map*)result;
7061 mresult->count2 = count2;
7062 mresult->count3 = count3;
7063 mlevel1 = mresult->level1;
7064 mlevel2 = mresult->level23;
7065 mlevel3 = mresult->level23 + 16*count2;
7066 memcpy(mlevel1, level1, 32);
7067 memset(mlevel2, 0xFF, 16*count2);
7068 memset(mlevel3, 0, 128*count3);
7069 count3 = 0;
7070 for (i = 1; i < 256; i++) {
7071 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007072 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007073 /* unmapped character */
7074 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007075 o1 = PyUnicode_READ(kind, data, i)>>11;
7076 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007077 i2 = 16*mlevel1[o1] + o2;
7078 if (mlevel2[i2] == 0xFF)
7079 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007080 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007081 i3 = 128*mlevel2[i2] + o3;
7082 mlevel3[i3] = i;
7083 }
7084 return result;
7085}
7086
7087static int
7088encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
7089{
7090 struct encoding_map *map = (struct encoding_map*)mapping;
7091 int l1 = c>>11;
7092 int l2 = (c>>7) & 0xF;
7093 int l3 = c & 0x7F;
7094 int i;
7095
7096#ifdef Py_UNICODE_WIDE
7097 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007098 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007099 }
7100#endif
7101 if (c == 0)
7102 return 0;
7103 /* level 1*/
7104 i = map->level1[l1];
7105 if (i == 0xFF) {
7106 return -1;
7107 }
7108 /* level 2*/
7109 i = map->level23[16*i+l2];
7110 if (i == 0xFF) {
7111 return -1;
7112 }
7113 /* level 3 */
7114 i = map->level23[16*map->count2 + 128*i + l3];
7115 if (i == 0) {
7116 return -1;
7117 }
7118 return i;
7119}
7120
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007121/* Lookup the character ch in the mapping. If the character
7122 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007123 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007124static PyObject *
7125charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007126{
Christian Heimes217cfd12007-12-02 14:31:20 +00007127 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007128 PyObject *x;
7129
7130 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007131 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007132 x = PyObject_GetItem(mapping, w);
7133 Py_DECREF(w);
7134 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007135 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7136 /* No mapping found means: mapping is undefined. */
7137 PyErr_Clear();
7138 x = Py_None;
7139 Py_INCREF(x);
7140 return x;
7141 } else
7142 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007143 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007144 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007145 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007146 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007147 long value = PyLong_AS_LONG(x);
7148 if (value < 0 || value > 255) {
7149 PyErr_SetString(PyExc_TypeError,
7150 "character mapping must be in range(256)");
7151 Py_DECREF(x);
7152 return NULL;
7153 }
7154 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007155 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007156 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007157 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007158 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007159 /* wrong return value */
7160 PyErr_Format(PyExc_TypeError,
7161 "character mapping must return integer, bytes or None, not %.400s",
7162 x->ob_type->tp_name);
7163 Py_DECREF(x);
7164 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007165 }
7166}
7167
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007168static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007169charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007170{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007171 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7172 /* exponentially overallocate to minimize reallocations */
7173 if (requiredsize < 2*outsize)
7174 requiredsize = 2*outsize;
7175 if (_PyBytes_Resize(outobj, requiredsize))
7176 return -1;
7177 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007178}
7179
/* Outcome of trying to encode one character with charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the replacement was written to the output */
    enc_FAILED,         /* the mapping is undefined for this character */
    enc_EXCEPTION       /* lookup or resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007183/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007184 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007185 space is available. Return a new reference to the object that
7186 was put in the output buffer, or Py_None, if the mapping was undefined
7187 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007188 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007189static charmapencode_result
7190charmapencode_output(Py_UNICODE c, PyObject *mapping,
7191 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007192{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007193 PyObject *rep;
7194 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007195 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007196
Christian Heimes90aa7642007-12-19 02:45:37 +00007197 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007198 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007199 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007200 if (res == -1)
7201 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007202 if (outsize<requiredsize)
7203 if (charmapencode_resize(outobj, outpos, requiredsize))
7204 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007205 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007206 outstart[(*outpos)++] = (char)res;
7207 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007208 }
7209
7210 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007211 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007212 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007213 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007214 Py_DECREF(rep);
7215 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007216 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007217 if (PyLong_Check(rep)) {
7218 Py_ssize_t requiredsize = *outpos+1;
7219 if (outsize<requiredsize)
7220 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7221 Py_DECREF(rep);
7222 return enc_EXCEPTION;
7223 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007224 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007225 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007226 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007227 else {
7228 const char *repchars = PyBytes_AS_STRING(rep);
7229 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7230 Py_ssize_t requiredsize = *outpos+repsize;
7231 if (outsize<requiredsize)
7232 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7233 Py_DECREF(rep);
7234 return enc_EXCEPTION;
7235 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007236 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007237 memcpy(outstart + *outpos, repchars, repsize);
7238 *outpos += repsize;
7239 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007240 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007241 Py_DECREF(rep);
7242 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007243}
7244
7245/* handle an error in PyUnicode_EncodeCharmap
7246 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007247static int
7248charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00007249 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007250 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007251 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007252 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007253{
7254 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007255 Py_ssize_t repsize;
7256 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007257 Py_UNICODE *uni2;
7258 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007259 Py_ssize_t collstartpos = *inpos;
7260 Py_ssize_t collendpos = *inpos+1;
7261 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007262 char *encoding = "charmap";
7263 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007264 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007265
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007266 /* find all unencodable characters */
7267 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007268 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007269 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007270 int res = encoding_map_lookup(p[collendpos], mapping);
7271 if (res != -1)
7272 break;
7273 ++collendpos;
7274 continue;
7275 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007276
Benjamin Peterson29060642009-01-31 22:14:21 +00007277 rep = charmapencode_lookup(p[collendpos], mapping);
7278 if (rep==NULL)
7279 return -1;
7280 else if (rep!=Py_None) {
7281 Py_DECREF(rep);
7282 break;
7283 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007284 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007285 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007286 }
7287 /* cache callback name lookup
7288 * (if not done yet, i.e. it's the first error) */
7289 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007290 if ((errors==NULL) || (!strcmp(errors, "strict")))
7291 *known_errorHandler = 1;
7292 else if (!strcmp(errors, "replace"))
7293 *known_errorHandler = 2;
7294 else if (!strcmp(errors, "ignore"))
7295 *known_errorHandler = 3;
7296 else if (!strcmp(errors, "xmlcharrefreplace"))
7297 *known_errorHandler = 4;
7298 else
7299 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007300 }
7301 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007302 case 1: /* strict */
7303 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7304 return -1;
7305 case 2: /* replace */
7306 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007307 x = charmapencode_output('?', mapping, res, respos);
7308 if (x==enc_EXCEPTION) {
7309 return -1;
7310 }
7311 else if (x==enc_FAILED) {
7312 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7313 return -1;
7314 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007315 }
7316 /* fall through */
7317 case 3: /* ignore */
7318 *inpos = collendpos;
7319 break;
7320 case 4: /* xmlcharrefreplace */
7321 /* generate replacement (temporarily (mis)uses p) */
7322 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007323 char buffer[2+29+1+1];
7324 char *cp;
7325 sprintf(buffer, "&#%d;", (int)p[collpos]);
7326 for (cp = buffer; *cp; ++cp) {
7327 x = charmapencode_output(*cp, mapping, res, respos);
7328 if (x==enc_EXCEPTION)
7329 return -1;
7330 else if (x==enc_FAILED) {
7331 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7332 return -1;
7333 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007334 }
7335 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007336 *inpos = collendpos;
7337 break;
7338 default:
7339 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00007340 encoding, reason, p, size, exceptionObject,
7341 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007342 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007343 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007344 if (PyBytes_Check(repunicode)) {
7345 /* Directly copy bytes result to output. */
7346 Py_ssize_t outsize = PyBytes_Size(*res);
7347 Py_ssize_t requiredsize;
7348 repsize = PyBytes_Size(repunicode);
7349 requiredsize = *respos + repsize;
7350 if (requiredsize > outsize)
7351 /* Make room for all additional bytes. */
7352 if (charmapencode_resize(res, respos, requiredsize)) {
7353 Py_DECREF(repunicode);
7354 return -1;
7355 }
7356 memcpy(PyBytes_AsString(*res) + *respos,
7357 PyBytes_AsString(repunicode), repsize);
7358 *respos += repsize;
7359 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007360 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007361 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007362 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007363 /* generate replacement */
7364 repsize = PyUnicode_GET_SIZE(repunicode);
7365 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007366 x = charmapencode_output(*uni2, mapping, res, respos);
7367 if (x==enc_EXCEPTION) {
7368 return -1;
7369 }
7370 else if (x==enc_FAILED) {
7371 Py_DECREF(repunicode);
7372 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7373 return -1;
7374 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007375 }
7376 *inpos = newpos;
7377 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007378 }
7379 return 0;
7380}
7381
Alexander Belopolsky40018472011-02-26 01:02:56 +00007382PyObject *
7383PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7384 Py_ssize_t size,
7385 PyObject *mapping,
7386 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007387{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007388 /* output object */
7389 PyObject *res = NULL;
7390 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007391 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007392 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007393 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007394 PyObject *errorHandler = NULL;
7395 PyObject *exc = NULL;
7396 /* the following variable is used for caching string comparisons
7397 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7398 * 3=ignore, 4=xmlcharrefreplace */
7399 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007400
7401 /* Default to Latin-1 */
7402 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007403 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007404
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007405 /* allocate enough for a simple encoding without
7406 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007407 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007408 if (res == NULL)
7409 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007410 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007411 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007412
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007413 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007414 /* try to encode it */
7415 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7416 if (x==enc_EXCEPTION) /* error */
7417 goto onError;
7418 if (x==enc_FAILED) { /* unencodable character */
7419 if (charmap_encoding_error(p, size, &inpos, mapping,
7420 &exc,
7421 &known_errorHandler, &errorHandler, errors,
7422 &res, &respos)) {
7423 goto onError;
7424 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007425 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007426 else
7427 /* done with this character => adjust input position */
7428 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007429 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007430
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007431 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007432 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007433 if (_PyBytes_Resize(&res, respos) < 0)
7434 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00007435
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007436 Py_XDECREF(exc);
7437 Py_XDECREF(errorHandler);
7438 return res;
7439
Benjamin Peterson29060642009-01-31 22:14:21 +00007440 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007441 Py_XDECREF(res);
7442 Py_XDECREF(exc);
7443 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007444 return NULL;
7445}
7446
Alexander Belopolsky40018472011-02-26 01:02:56 +00007447PyObject *
7448PyUnicode_AsCharmapString(PyObject *unicode,
7449 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007450{
7451 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007452 PyErr_BadArgument();
7453 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007454 }
7455 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007456 PyUnicode_GET_SIZE(unicode),
7457 mapping,
7458 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007459}
7460
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007461/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007462static void
7463make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007464 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007465 Py_ssize_t startpos, Py_ssize_t endpos,
7466 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007467{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007468 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007469 *exceptionObject = _PyUnicodeTranslateError_Create(
7470 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007471 }
7472 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007473 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7474 goto onError;
7475 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7476 goto onError;
7477 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7478 goto onError;
7479 return;
7480 onError:
7481 Py_DECREF(*exceptionObject);
7482 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007483 }
7484}
7485
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007486/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007487static void
7488raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007489 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007490 Py_ssize_t startpos, Py_ssize_t endpos,
7491 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007492{
7493 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007494 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007495 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007496 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007497}
7498
7499/* error handling callback helper:
7500 build arguments, call the callback and check the arguments,
7501 put the result into newpos and return the replacement string, which
7502 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007503static PyObject *
7504unicode_translate_call_errorhandler(const char *errors,
7505 PyObject **errorHandler,
7506 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007507 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007508 Py_ssize_t startpos, Py_ssize_t endpos,
7509 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007510{
Benjamin Peterson142957c2008-07-04 19:55:29 +00007511 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007512
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007513 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007514 PyObject *restuple;
7515 PyObject *resunicode;
7516
7517 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007518 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007519 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007520 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007521 }
7522
7523 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007524 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007525 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007526 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007527
7528 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00007529 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007530 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007531 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007532 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00007533 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007534 Py_DECREF(restuple);
7535 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007536 }
7537 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00007538 &resunicode, &i_newpos)) {
7539 Py_DECREF(restuple);
7540 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007541 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007542 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007543 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007544 else
7545 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007546 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007547 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7548 Py_DECREF(restuple);
7549 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007550 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007551 Py_INCREF(resunicode);
7552 Py_DECREF(restuple);
7553 return resunicode;
7554}
7555
7556/* Lookup the character ch in the mapping and put the result in result,
7557 which must be decrefed by the caller.
7558 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007559static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007560charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007561{
Christian Heimes217cfd12007-12-02 14:31:20 +00007562 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007563 PyObject *x;
7564
7565 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007566 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007567 x = PyObject_GetItem(mapping, w);
7568 Py_DECREF(w);
7569 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007570 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7571 /* No mapping found means: use 1:1 mapping. */
7572 PyErr_Clear();
7573 *result = NULL;
7574 return 0;
7575 } else
7576 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007577 }
7578 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007579 *result = x;
7580 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007581 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007582 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007583 long value = PyLong_AS_LONG(x);
7584 long max = PyUnicode_GetMax();
7585 if (value < 0 || value > max) {
7586 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00007587 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007588 Py_DECREF(x);
7589 return -1;
7590 }
7591 *result = x;
7592 return 0;
7593 }
7594 else if (PyUnicode_Check(x)) {
7595 *result = x;
7596 return 0;
7597 }
7598 else {
7599 /* wrong return value */
7600 PyErr_SetString(PyExc_TypeError,
7601 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007602 Py_DECREF(x);
7603 return -1;
7604 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007605}
7606/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00007607 if not reallocate and adjust various state variables.
7608 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007609static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007610charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00007611 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007612{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007613 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00007614 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007615 /* exponentially overallocate to minimize reallocations */
7616 if (requiredsize < 2 * oldsize)
7617 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007618 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
7619 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007620 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007621 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007622 }
7623 return 0;
7624}
7625/* lookup the character, put the result in the output string and adjust
7626 various state variables. Return a new reference to the object that
7627 was put in the output buffer in *result, or Py_None, if the mapping was
7628 undefined (in which case no character was written).
7629 The called must decref result.
7630 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007631static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007632charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
7633 PyObject *mapping, Py_UCS4 **output,
7634 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007635 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007636{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007637 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
7638 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00007639 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007640 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007641 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007642 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007643 }
7644 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007645 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00007646 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007647 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007648 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007649 }
7650 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007651 Py_ssize_t repsize;
7652 if (PyUnicode_READY(*res) == -1)
7653 return -1;
7654 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00007655 if (repsize==1) {
7656 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007657 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00007658 }
7659 else if (repsize!=0) {
7660 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007661 Py_ssize_t requiredsize = *opos +
7662 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00007663 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007664 Py_ssize_t i;
7665 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007666 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007667 for(i = 0; i < repsize; i++)
7668 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00007669 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007670 }
7671 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007672 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007673 return 0;
7674}
7675
Alexander Belopolsky40018472011-02-26 01:02:56 +00007676PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007677_PyUnicode_TranslateCharmap(PyObject *input,
7678 PyObject *mapping,
7679 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007680{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007681 /* input object */
7682 char *idata;
7683 Py_ssize_t size, i;
7684 int kind;
7685 /* output buffer */
7686 Py_UCS4 *output = NULL;
7687 Py_ssize_t osize;
7688 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007689 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007690 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007691 char *reason = "character maps to <undefined>";
7692 PyObject *errorHandler = NULL;
7693 PyObject *exc = NULL;
7694 /* the following variable is used for caching string comparisons
7695 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7696 * 3=ignore, 4=xmlcharrefreplace */
7697 int known_errorHandler = -1;
7698
Guido van Rossumd57fd912000-03-10 22:53:23 +00007699 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007700 PyErr_BadArgument();
7701 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007702 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007703
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007704 if (PyUnicode_READY(input) == -1)
7705 return NULL;
7706 idata = (char*)PyUnicode_DATA(input);
7707 kind = PyUnicode_KIND(input);
7708 size = PyUnicode_GET_LENGTH(input);
7709 i = 0;
7710
7711 if (size == 0) {
7712 Py_INCREF(input);
7713 return input;
7714 }
7715
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007716 /* allocate enough for a simple 1:1 translation without
7717 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007718 osize = size;
7719 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
7720 opos = 0;
7721 if (output == NULL) {
7722 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00007723 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007724 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007725
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007726 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007727 /* try to encode it */
7728 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007729 if (charmaptranslate_output(input, i, mapping,
7730 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007731 Py_XDECREF(x);
7732 goto onError;
7733 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007734 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00007735 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007736 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007737 else { /* untranslatable character */
7738 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7739 Py_ssize_t repsize;
7740 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007741 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00007742 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007743 Py_ssize_t collstart = i;
7744 Py_ssize_t collend = i+1;
7745 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007746
Benjamin Peterson29060642009-01-31 22:14:21 +00007747 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007748 while (collend < size) {
7749 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007750 goto onError;
7751 Py_XDECREF(x);
7752 if (x!=Py_None)
7753 break;
7754 ++collend;
7755 }
7756 /* cache callback name lookup
7757 * (if not done yet, i.e. it's the first error) */
7758 if (known_errorHandler==-1) {
7759 if ((errors==NULL) || (!strcmp(errors, "strict")))
7760 known_errorHandler = 1;
7761 else if (!strcmp(errors, "replace"))
7762 known_errorHandler = 2;
7763 else if (!strcmp(errors, "ignore"))
7764 known_errorHandler = 3;
7765 else if (!strcmp(errors, "xmlcharrefreplace"))
7766 known_errorHandler = 4;
7767 else
7768 known_errorHandler = 0;
7769 }
7770 switch (known_errorHandler) {
7771 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007772 raise_translate_exception(&exc, input, collstart,
7773 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007774 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007775 case 2: /* replace */
7776 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007777 for (coll = collstart; coll<collend; coll++)
7778 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00007779 /* fall through */
7780 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007781 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007782 break;
7783 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007784 /* generate replacement (temporarily (mis)uses i) */
7785 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007786 char buffer[2+29+1+1];
7787 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007788 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
7789 if (charmaptranslate_makespace(&output, &osize,
7790 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00007791 goto onError;
7792 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007793 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00007794 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007795 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007796 break;
7797 default:
7798 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007799 reason, input, &exc,
7800 collstart, collend, &newpos);
7801 if (repunicode == NULL || PyUnicode_READY(repunicode) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007802 goto onError;
7803 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007804 repsize = PyUnicode_GET_LENGTH(repunicode);
7805 if (charmaptranslate_makespace(&output, &osize,
7806 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007807 Py_DECREF(repunicode);
7808 goto onError;
7809 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007810 for (uni2 = 0; repsize-->0; ++uni2)
7811 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
7812 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00007813 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007814 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007815 }
7816 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007817 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
7818 if (!res)
7819 goto onError;
7820 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007821 Py_XDECREF(exc);
7822 Py_XDECREF(errorHandler);
7823 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007824
Benjamin Peterson29060642009-01-31 22:14:21 +00007825 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007826 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007827 Py_XDECREF(exc);
7828 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007829 return NULL;
7830}
7831
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007832/* Deprecated. Use PyUnicode_Translate instead. */
7833PyObject *
7834PyUnicode_TranslateCharmap(const Py_UNICODE *p,
7835 Py_ssize_t size,
7836 PyObject *mapping,
7837 const char *errors)
7838{
7839 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7840 if (!unicode)
7841 return NULL;
7842 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
7843}
7844
Alexander Belopolsky40018472011-02-26 01:02:56 +00007845PyObject *
7846PyUnicode_Translate(PyObject *str,
7847 PyObject *mapping,
7848 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007849{
7850 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007851
Guido van Rossumd57fd912000-03-10 22:53:23 +00007852 str = PyUnicode_FromObject(str);
7853 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007854 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007855 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007856 Py_DECREF(str);
7857 return result;
Tim Petersced69f82003-09-16 20:30:58 +00007858
Benjamin Peterson29060642009-01-31 22:14:21 +00007859 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007860 Py_XDECREF(str);
7861 return NULL;
7862}
Tim Petersced69f82003-09-16 20:30:58 +00007863
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007864static Py_UCS4
7865fix_decimal_and_space_to_ascii(PyUnicodeObject *self)
7866{
7867 /* No need to call PyUnicode_READY(self) because this function is only
7868 called as a callback from fixup() which does it already. */
7869 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
7870 const int kind = PyUnicode_KIND(self);
7871 void *data = PyUnicode_DATA(self);
7872 Py_UCS4 maxchar = 0, ch, fixed;
7873 Py_ssize_t i;
7874
7875 for (i = 0; i < len; ++i) {
7876 ch = PyUnicode_READ(kind, data, i);
7877 fixed = 0;
7878 if (ch > 127) {
7879 if (Py_UNICODE_ISSPACE(ch))
7880 fixed = ' ';
7881 else {
7882 const int decimal = Py_UNICODE_TODECIMAL(ch);
7883 if (decimal >= 0)
7884 fixed = '0' + decimal;
7885 }
7886 if (fixed != 0) {
7887 if (fixed > maxchar)
7888 maxchar = fixed;
7889 PyUnicode_WRITE(kind, data, i, fixed);
7890 }
7891 else if (ch > maxchar)
7892 maxchar = ch;
7893 }
7894 else if (ch > maxchar)
7895 maxchar = ch;
7896 }
7897
7898 return maxchar;
7899}
7900
7901PyObject *
7902_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
7903{
7904 if (!PyUnicode_Check(unicode)) {
7905 PyErr_BadInternalCall();
7906 return NULL;
7907 }
7908 if (PyUnicode_READY(unicode) == -1)
7909 return NULL;
7910 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
7911 /* If the string is already ASCII, just return the same string */
7912 Py_INCREF(unicode);
7913 return unicode;
7914 }
7915 return fixup((PyUnicodeObject *)unicode, fix_decimal_and_space_to_ascii);
7916}
7917
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007918PyObject *
7919PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
7920 Py_ssize_t length)
7921{
7922 PyObject *result;
7923 Py_UNICODE *p; /* write pointer into result */
7924 Py_ssize_t i;
7925 /* Copy to a new string */
7926 result = (PyObject *)_PyUnicode_New(length);
7927 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
7928 if (result == NULL)
7929 return result;
7930 p = PyUnicode_AS_UNICODE(result);
7931 /* Iterate over code points */
7932 for (i = 0; i < length; i++) {
7933 Py_UNICODE ch =s[i];
7934 if (ch > 127) {
7935 int decimal = Py_UNICODE_TODECIMAL(ch);
7936 if (decimal >= 0)
7937 p[i] = '0' + decimal;
7938 }
7939 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007940 if (PyUnicode_READY((PyUnicodeObject*)result) == -1) {
7941 Py_DECREF(result);
7942 return NULL;
7943 }
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007944 return result;
7945}
Guido van Rossum9e896b32000-04-05 20:11:21 +00007946/* --- Decimal Encoder ---------------------------------------------------- */
7947
Alexander Belopolsky40018472011-02-26 01:02:56 +00007948int
7949PyUnicode_EncodeDecimal(Py_UNICODE *s,
7950 Py_ssize_t length,
7951 char *output,
7952 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00007953{
7954 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007955 PyObject *errorHandler = NULL;
7956 PyObject *exc = NULL;
7957 const char *encoding = "decimal";
7958 const char *reason = "invalid decimal Unicode string";
7959 /* the following variable is used for caching string comparisons
7960 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
7961 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00007962
7963 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007964 PyErr_BadArgument();
7965 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00007966 }
7967
7968 p = s;
7969 end = s + length;
7970 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007971 register Py_UNICODE ch = *p;
7972 int decimal;
7973 PyObject *repunicode;
7974 Py_ssize_t repsize;
7975 Py_ssize_t newpos;
7976 Py_UNICODE *uni2;
7977 Py_UNICODE *collstart;
7978 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00007979
Benjamin Peterson29060642009-01-31 22:14:21 +00007980 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007981 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00007982 ++p;
7983 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007984 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007985 decimal = Py_UNICODE_TODECIMAL(ch);
7986 if (decimal >= 0) {
7987 *output++ = '0' + decimal;
7988 ++p;
7989 continue;
7990 }
7991 if (0 < ch && ch < 256) {
7992 *output++ = (char)ch;
7993 ++p;
7994 continue;
7995 }
7996 /* All other characters are considered unencodable */
7997 collstart = p;
7998 collend = p+1;
7999 while (collend < end) {
8000 if ((0 < *collend && *collend < 256) ||
8001 !Py_UNICODE_ISSPACE(*collend) ||
8002 Py_UNICODE_TODECIMAL(*collend))
8003 break;
8004 }
8005 /* cache callback name lookup
8006 * (if not done yet, i.e. it's the first error) */
8007 if (known_errorHandler==-1) {
8008 if ((errors==NULL) || (!strcmp(errors, "strict")))
8009 known_errorHandler = 1;
8010 else if (!strcmp(errors, "replace"))
8011 known_errorHandler = 2;
8012 else if (!strcmp(errors, "ignore"))
8013 known_errorHandler = 3;
8014 else if (!strcmp(errors, "xmlcharrefreplace"))
8015 known_errorHandler = 4;
8016 else
8017 known_errorHandler = 0;
8018 }
8019 switch (known_errorHandler) {
8020 case 1: /* strict */
8021 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
8022 goto onError;
8023 case 2: /* replace */
8024 for (p = collstart; p < collend; ++p)
8025 *output++ = '?';
8026 /* fall through */
8027 case 3: /* ignore */
8028 p = collend;
8029 break;
8030 case 4: /* xmlcharrefreplace */
8031 /* generate replacement (temporarily (mis)uses p) */
8032 for (p = collstart; p < collend; ++p)
8033 output += sprintf(output, "&#%d;", (int)*p);
8034 p = collend;
8035 break;
8036 default:
8037 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
8038 encoding, reason, s, length, &exc,
8039 collstart-s, collend-s, &newpos);
8040 if (repunicode == NULL)
8041 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008042 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00008043 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008044 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8045 Py_DECREF(repunicode);
8046 goto onError;
8047 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008048 /* generate replacement */
8049 repsize = PyUnicode_GET_SIZE(repunicode);
8050 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
8051 Py_UNICODE ch = *uni2;
8052 if (Py_UNICODE_ISSPACE(ch))
8053 *output++ = ' ';
8054 else {
8055 decimal = Py_UNICODE_TODECIMAL(ch);
8056 if (decimal >= 0)
8057 *output++ = '0' + decimal;
8058 else if (0 < ch && ch < 256)
8059 *output++ = (char)ch;
8060 else {
8061 Py_DECREF(repunicode);
8062 raise_encode_exception(&exc, encoding,
8063 s, length, collstart-s, collend-s, reason);
8064 goto onError;
8065 }
8066 }
8067 }
8068 p = s + newpos;
8069 Py_DECREF(repunicode);
8070 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00008071 }
8072 /* 0-terminate the output string */
8073 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008074 Py_XDECREF(exc);
8075 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008076 return 0;
8077
Benjamin Peterson29060642009-01-31 22:14:21 +00008078 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008079 Py_XDECREF(exc);
8080 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008081 return -1;
8082}
8083
Guido van Rossumd57fd912000-03-10 22:53:23 +00008084/* --- Helpers ------------------------------------------------------------ */
8085
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008086#include "stringlib/ucs1lib.h"
8087#include "stringlib/fastsearch.h"
8088#include "stringlib/partition.h"
8089#include "stringlib/split.h"
8090#include "stringlib/count.h"
8091#include "stringlib/find.h"
8092#include "stringlib/localeutil.h"
8093#include "stringlib/undef.h"
8094
8095#include "stringlib/ucs2lib.h"
8096#include "stringlib/fastsearch.h"
8097#include "stringlib/partition.h"
8098#include "stringlib/split.h"
8099#include "stringlib/count.h"
8100#include "stringlib/find.h"
8101#include "stringlib/localeutil.h"
8102#include "stringlib/undef.h"
8103
8104#include "stringlib/ucs4lib.h"
8105#include "stringlib/fastsearch.h"
8106#include "stringlib/partition.h"
8107#include "stringlib/split.h"
8108#include "stringlib/count.h"
8109#include "stringlib/find.h"
8110#include "stringlib/localeutil.h"
8111#include "stringlib/undef.h"
8112
8113static Py_ssize_t
8114any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
8115 const Py_UCS1*, Py_ssize_t,
8116 Py_ssize_t, Py_ssize_t),
8117 Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
8118 const Py_UCS2*, Py_ssize_t,
8119 Py_ssize_t, Py_ssize_t),
8120 Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t,
8121 const Py_UCS4*, Py_ssize_t,
8122 Py_ssize_t, Py_ssize_t),
8123 PyObject* s1, PyObject* s2,
8124 Py_ssize_t start,
8125 Py_ssize_t end)
8126{
8127 int kind1, kind2, kind;
8128 void *buf1, *buf2;
8129 Py_ssize_t len1, len2, result;
8130
8131 kind1 = PyUnicode_KIND(s1);
8132 kind2 = PyUnicode_KIND(s2);
8133 kind = kind1 > kind2 ? kind1 : kind2;
8134 buf1 = PyUnicode_DATA(s1);
8135 buf2 = PyUnicode_DATA(s2);
8136 if (kind1 != kind)
8137 buf1 = _PyUnicode_AsKind(s1, kind);
8138 if (!buf1)
8139 return -2;
8140 if (kind2 != kind)
8141 buf2 = _PyUnicode_AsKind(s2, kind);
8142 if (!buf2) {
8143 if (kind1 != kind) PyMem_Free(buf1);
8144 return -2;
8145 }
8146 len1 = PyUnicode_GET_LENGTH(s1);
8147 len2 = PyUnicode_GET_LENGTH(s2);
8148
8149 switch(kind) {
8150 case PyUnicode_1BYTE_KIND:
8151 result = ucs1(buf1, len1, buf2, len2, start, end);
8152 break;
8153 case PyUnicode_2BYTE_KIND:
8154 result = ucs2(buf1, len1, buf2, len2, start, end);
8155 break;
8156 case PyUnicode_4BYTE_KIND:
8157 result = ucs4(buf1, len1, buf2, len2, start, end);
8158 break;
8159 default:
8160 assert(0); result = -2;
8161 }
8162
8163 if (kind1 != kind)
8164 PyMem_Free(buf1);
8165 if (kind2 != kind)
8166 PyMem_Free(buf2);
8167
8168 return result;
8169}
8170
8171Py_ssize_t
8172_PyUnicode_InsertThousandsGrouping(int kind, void *data,
8173 Py_ssize_t n_buffer,
8174 void *digits, Py_ssize_t n_digits,
8175 Py_ssize_t min_width,
8176 const char *grouping,
8177 const char *thousands_sep)
8178{
8179 switch(kind) {
8180 case PyUnicode_1BYTE_KIND:
8181 return _PyUnicode_ucs1_InsertThousandsGrouping(
8182 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8183 min_width, grouping, thousands_sep);
8184 case PyUnicode_2BYTE_KIND:
8185 return _PyUnicode_ucs2_InsertThousandsGrouping(
8186 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
8187 min_width, grouping, thousands_sep);
8188 case PyUnicode_4BYTE_KIND:
8189 return _PyUnicode_ucs4_InsertThousandsGrouping(
8190 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
8191 min_width, grouping, thousands_sep);
8192 }
8193 assert(0);
8194 return -1;
8195}
8196
8197
Eric Smith8c663262007-08-25 02:26:07 +00008198#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00008199#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008200
Thomas Wouters477c8d52006-05-27 19:21:47 +00008201#include "stringlib/count.h"
8202#include "stringlib/find.h"
Eric Smith5807c412008-05-11 21:00:57 +00008203
/* Helper macro to fix up start/end slice values against a length `len`:
   - `end` greater than `len` is clamped to `len`;
   - a negative `end` or `start` is taken relative to `len` and then
     clamped to 0 if it is still negative.
   `start` is NOT clamped to `len`, so callers must still cope with
   start > end.

   `start` and `end` must be modifiable lvalues; `len` is evaluated more
   than once, so it must not have side effects.  The body is wrapped in
   do { } while (0) so the macro expands to exactly one statement and is
   safe inside an unbraced if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00008218
Alexander Belopolsky40018472011-02-26 01:02:56 +00008219Py_ssize_t
8220PyUnicode_Count(PyObject *str,
8221 PyObject *substr,
8222 Py_ssize_t start,
8223 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008224{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008225 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008226 PyUnicodeObject* str_obj;
8227 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008228 int kind1, kind2, kind;
8229 void *buf1 = NULL, *buf2 = NULL;
8230 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008231
Thomas Wouters477c8d52006-05-27 19:21:47 +00008232 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008233 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008234 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008235 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02008236 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008237 Py_DECREF(str_obj);
8238 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008239 }
Tim Petersced69f82003-09-16 20:30:58 +00008240
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008241 kind1 = PyUnicode_KIND(str_obj);
8242 kind2 = PyUnicode_KIND(sub_obj);
8243 kind = kind1 > kind2 ? kind1 : kind2;
8244 buf1 = PyUnicode_DATA(str_obj);
8245 if (kind1 != kind)
8246 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
8247 if (!buf1)
8248 goto onError;
8249 buf2 = PyUnicode_DATA(sub_obj);
8250 if (kind2 != kind)
8251 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
8252 if (!buf2)
8253 goto onError;
8254 len1 = PyUnicode_GET_LENGTH(str_obj);
8255 len2 = PyUnicode_GET_LENGTH(sub_obj);
8256
8257 ADJUST_INDICES(start, end, len1);
8258 switch(kind) {
8259 case PyUnicode_1BYTE_KIND:
8260 result = ucs1lib_count(
8261 ((Py_UCS1*)buf1) + start, end - start,
8262 buf2, len2, PY_SSIZE_T_MAX
8263 );
8264 break;
8265 case PyUnicode_2BYTE_KIND:
8266 result = ucs2lib_count(
8267 ((Py_UCS2*)buf1) + start, end - start,
8268 buf2, len2, PY_SSIZE_T_MAX
8269 );
8270 break;
8271 case PyUnicode_4BYTE_KIND:
8272 result = ucs4lib_count(
8273 ((Py_UCS4*)buf1) + start, end - start,
8274 buf2, len2, PY_SSIZE_T_MAX
8275 );
8276 break;
8277 default:
8278 assert(0); result = 0;
8279 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008280
8281 Py_DECREF(sub_obj);
8282 Py_DECREF(str_obj);
8283
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008284 if (kind1 != kind)
8285 PyMem_Free(buf1);
8286 if (kind2 != kind)
8287 PyMem_Free(buf2);
8288
Guido van Rossumd57fd912000-03-10 22:53:23 +00008289 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008290 onError:
8291 Py_DECREF(sub_obj);
8292 Py_DECREF(str_obj);
8293 if (kind1 != kind && buf1)
8294 PyMem_Free(buf1);
8295 if (kind2 != kind && buf2)
8296 PyMem_Free(buf2);
8297 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008298}
8299
Alexander Belopolsky40018472011-02-26 01:02:56 +00008300Py_ssize_t
8301PyUnicode_Find(PyObject *str,
8302 PyObject *sub,
8303 Py_ssize_t start,
8304 Py_ssize_t end,
8305 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008306{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008307 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008308
Guido van Rossumd57fd912000-03-10 22:53:23 +00008309 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008310 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008311 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008312 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008313 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008314 Py_DECREF(str);
8315 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008316 }
Tim Petersced69f82003-09-16 20:30:58 +00008317
Thomas Wouters477c8d52006-05-27 19:21:47 +00008318 if (direction > 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008319 result = any_find_slice(
8320 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
8321 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008322 );
8323 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008324 result = any_find_slice(
8325 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
8326 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008327 );
8328
Guido van Rossumd57fd912000-03-10 22:53:23 +00008329 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008330 Py_DECREF(sub);
8331
Guido van Rossumd57fd912000-03-10 22:53:23 +00008332 return result;
8333}
8334
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008335Py_ssize_t
8336PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8337 Py_ssize_t start, Py_ssize_t end,
8338 int direction)
8339{
8340 char *result;
8341 int kind;
8342 if (PyUnicode_READY(str) == -1)
8343 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02008344 if (start < 0 || end < 0) {
8345 PyErr_SetString(PyExc_IndexError, "string index out of range");
8346 return -2;
8347 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008348 if (end > PyUnicode_GET_LENGTH(str))
8349 end = PyUnicode_GET_LENGTH(str);
8350 kind = PyUnicode_KIND(str);
8351 result = findchar(PyUnicode_1BYTE_DATA(str)
8352 + PyUnicode_KIND_SIZE(kind, start),
8353 kind,
8354 end-start, ch, direction);
8355 if (!result)
8356 return -1;
8357 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8358}
8359
Alexander Belopolsky40018472011-02-26 01:02:56 +00008360static int
8361tailmatch(PyUnicodeObject *self,
8362 PyUnicodeObject *substring,
8363 Py_ssize_t start,
8364 Py_ssize_t end,
8365 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008366{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008367 int kind_self;
8368 int kind_sub;
8369 void *data_self;
8370 void *data_sub;
8371 Py_ssize_t offset;
8372 Py_ssize_t i;
8373 Py_ssize_t end_sub;
8374
8375 if (PyUnicode_READY(self) == -1 ||
8376 PyUnicode_READY(substring) == -1)
8377 return 0;
8378
8379 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008380 return 1;
8381
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008382 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8383 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008384 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008385 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008386
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008387 kind_self = PyUnicode_KIND(self);
8388 data_self = PyUnicode_DATA(self);
8389 kind_sub = PyUnicode_KIND(substring);
8390 data_sub = PyUnicode_DATA(substring);
8391 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8392
8393 if (direction > 0)
8394 offset = end;
8395 else
8396 offset = start;
8397
8398 if (PyUnicode_READ(kind_self, data_self, offset) ==
8399 PyUnicode_READ(kind_sub, data_sub, 0) &&
8400 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8401 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8402 /* If both are of the same kind, memcmp is sufficient */
8403 if (kind_self == kind_sub) {
8404 return ! memcmp((char *)data_self +
8405 (offset * PyUnicode_CHARACTER_SIZE(substring)),
8406 data_sub,
8407 PyUnicode_GET_LENGTH(substring) *
8408 PyUnicode_CHARACTER_SIZE(substring));
8409 }
8410 /* otherwise we have to compare each character by first accesing it */
8411 else {
8412 /* We do not need to compare 0 and len(substring)-1 because
8413 the if statement above ensured already that they are equal
8414 when we end up here. */
8415 // TODO: honor direction and do a forward or backwards search
8416 for (i = 1; i < end_sub; ++i) {
8417 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8418 PyUnicode_READ(kind_sub, data_sub, i))
8419 return 0;
8420 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008421 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008422 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008423 }
8424
8425 return 0;
8426}
8427
Alexander Belopolsky40018472011-02-26 01:02:56 +00008428Py_ssize_t
8429PyUnicode_Tailmatch(PyObject *str,
8430 PyObject *substr,
8431 Py_ssize_t start,
8432 Py_ssize_t end,
8433 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008434{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008435 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008436
Guido van Rossumd57fd912000-03-10 22:53:23 +00008437 str = PyUnicode_FromObject(str);
8438 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008439 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008440 substr = PyUnicode_FromObject(substr);
8441 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008442 Py_DECREF(str);
8443 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008444 }
Tim Petersced69f82003-09-16 20:30:58 +00008445
Guido van Rossumd57fd912000-03-10 22:53:23 +00008446 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00008447 (PyUnicodeObject *)substr,
8448 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008449 Py_DECREF(str);
8450 Py_DECREF(substr);
8451 return result;
8452}
8453
Guido van Rossumd57fd912000-03-10 22:53:23 +00008454/* Apply fixfct filter to the Unicode object self and return a
8455 reference to the modified object */
8456
Alexander Belopolsky40018472011-02-26 01:02:56 +00008457static PyObject *
8458fixup(PyUnicodeObject *self,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008459 Py_UCS4 (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008460{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008461 PyObject *u;
8462 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008463
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008464 if (PyUnicode_READY(self) == -1)
8465 return NULL;
8466 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8467 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8468 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008469 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008470 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008471
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008472 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
8473 PyUnicode_GET_LENGTH(u) * PyUnicode_CHARACTER_SIZE(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008474
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008475 /* fix functions return the new maximum character in a string,
8476 if the kind of the resulting unicode object does not change,
8477 everything is fine. Otherwise we need to change the string kind
8478 and re-run the fix function. */
8479 maxchar_new = fixfct((PyUnicodeObject*)u);
8480 if (maxchar_new == 0)
8481 /* do nothing, keep maxchar_new at 0 which means no changes. */;
8482 else if (maxchar_new <= 127)
8483 maxchar_new = 127;
8484 else if (maxchar_new <= 255)
8485 maxchar_new = 255;
8486 else if (maxchar_new <= 65535)
8487 maxchar_new = 65535;
8488 else
8489 maxchar_new = 1114111; /* 0x10ffff */
8490
8491 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008492 /* fixfct should return TRUE if it modified the buffer. If
8493 FALSE, return a reference to the original buffer instead
8494 (to save space, not time) */
8495 Py_INCREF(self);
8496 Py_DECREF(u);
8497 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008498 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008499 else if (maxchar_new == maxchar_old) {
8500 return u;
8501 }
8502 else {
8503 /* In case the maximum character changed, we need to
8504 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008505 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008506 if (v == NULL) {
8507 Py_DECREF(u);
8508 return NULL;
8509 }
8510 if (maxchar_new > maxchar_old) {
8511 /* If the maxchar increased so that the kind changed, not all
8512 characters are representable anymore and we need to fix the
8513 string again. This only happens in very few cases. */
Victor Stinner157f83f2011-09-28 21:41:31 +02008514 if (PyUnicode_CopyCharacters(v, 0,
8515 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008516 PyUnicode_GET_LENGTH(self)) < 0)
8517 {
8518 Py_DECREF(u);
8519 return NULL;
8520 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008521 maxchar_old = fixfct((PyUnicodeObject*)v);
8522 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8523 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008524 else {
Victor Stinner157f83f2011-09-28 21:41:31 +02008525 if (PyUnicode_CopyCharacters(v, 0,
8526 u, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008527 PyUnicode_GET_LENGTH(self)) < 0)
8528 {
8529 Py_DECREF(u);
8530 return NULL;
8531 }
8532 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008533
8534 Py_DECREF(u);
8535 return v;
8536 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008537}
8538
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008539static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008540fixupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008541{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008542 /* No need to call PyUnicode_READY(self) because this function is only
8543 called as a callback from fixup() which does it already. */
8544 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8545 const int kind = PyUnicode_KIND(self);
8546 void *data = PyUnicode_DATA(self);
8547 int touched = 0;
8548 Py_UCS4 maxchar = 0;
8549 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008550
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008551 for (i = 0; i < len; ++i) {
8552 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8553 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8554 if (up != ch) {
8555 if (up > maxchar)
8556 maxchar = up;
8557 PyUnicode_WRITE(kind, data, i, up);
8558 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008559 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008560 else if (ch > maxchar)
8561 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008562 }
8563
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008564 if (touched)
8565 return maxchar;
8566 else
8567 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008568}
8569
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008570static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008571fixlower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008572{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008573 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8574 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8575 const int kind = PyUnicode_KIND(self);
8576 void *data = PyUnicode_DATA(self);
8577 int touched = 0;
8578 Py_UCS4 maxchar = 0;
8579 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008580
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008581 for(i = 0; i < len; ++i) {
8582 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8583 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8584 if (lo != ch) {
8585 if (lo > maxchar)
8586 maxchar = lo;
8587 PyUnicode_WRITE(kind, data, i, lo);
8588 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008589 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008590 else if (ch > maxchar)
8591 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008592 }
8593
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008594 if (touched)
8595 return maxchar;
8596 else
8597 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008598}
8599
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008600static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008601fixswapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008602{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008603 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8604 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8605 const int kind = PyUnicode_KIND(self);
8606 void *data = PyUnicode_DATA(self);
8607 int touched = 0;
8608 Py_UCS4 maxchar = 0;
8609 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008610
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008611 for(i = 0; i < len; ++i) {
8612 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8613 Py_UCS4 nu = 0;
8614
8615 if (Py_UNICODE_ISUPPER(ch))
8616 nu = Py_UNICODE_TOLOWER(ch);
8617 else if (Py_UNICODE_ISLOWER(ch))
8618 nu = Py_UNICODE_TOUPPER(ch);
8619
8620 if (nu != 0) {
8621 if (nu > maxchar)
8622 maxchar = nu;
8623 PyUnicode_WRITE(kind, data, i, nu);
8624 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008625 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008626 else if (ch > maxchar)
8627 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008628 }
8629
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008630 if (touched)
8631 return maxchar;
8632 else
8633 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008634}
8635
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008636static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008637fixcapitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008638{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008639 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8640 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8641 const int kind = PyUnicode_KIND(self);
8642 void *data = PyUnicode_DATA(self);
8643 int touched = 0;
8644 Py_UCS4 maxchar = 0;
8645 Py_ssize_t i = 0;
8646 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00008647
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008648 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008649 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008650
8651 ch = PyUnicode_READ(kind, data, i);
8652 if (!Py_UNICODE_ISUPPER(ch)) {
8653 maxchar = Py_UNICODE_TOUPPER(ch);
8654 PyUnicode_WRITE(kind, data, i, maxchar);
8655 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008656 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008657 ++i;
8658 for(; i < len; ++i) {
8659 ch = PyUnicode_READ(kind, data, i);
8660 if (!Py_UNICODE_ISLOWER(ch)) {
8661 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8662 if (lo > maxchar)
8663 maxchar = lo;
8664 PyUnicode_WRITE(kind, data, i, lo);
8665 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008666 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008667 else if (ch > maxchar)
8668 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008669 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008670
8671 if (touched)
8672 return maxchar;
8673 else
8674 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008675}
8676
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008677static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008678fixtitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008679{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008680 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8681 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8682 const int kind = PyUnicode_KIND(self);
8683 void *data = PyUnicode_DATA(self);
8684 Py_UCS4 maxchar = 0;
8685 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008686 int previous_is_cased;
8687
8688 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008689 if (len == 1) {
8690 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8691 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
8692 if (ti != ch) {
8693 PyUnicode_WRITE(kind, data, i, ti);
8694 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00008695 }
8696 else
8697 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008698 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008699 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008700 for(; i < len; ++i) {
8701 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8702 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00008703
Benjamin Peterson29060642009-01-31 22:14:21 +00008704 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008705 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00008706 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008707 nu = Py_UNICODE_TOTITLE(ch);
8708
8709 if (nu > maxchar)
8710 maxchar = nu;
8711 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00008712
Benjamin Peterson29060642009-01-31 22:14:21 +00008713 if (Py_UNICODE_ISLOWER(ch) ||
8714 Py_UNICODE_ISUPPER(ch) ||
8715 Py_UNICODE_ISTITLE(ch))
8716 previous_is_cased = 1;
8717 else
8718 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008719 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008720 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008721}
8722
Tim Peters8ce9f162004-08-27 01:49:32 +00008723PyObject *
8724PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008725{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008726 PyObject *sep = NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008727 Py_ssize_t seplen = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008728 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00008729 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008730 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
8731 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00008732 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008733 Py_ssize_t sz, i, res_offset;
8734 Py_UCS4 maxchar = 0;
8735 Py_UCS4 item_maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008736
Tim Peters05eba1f2004-08-27 21:32:02 +00008737 fseq = PySequence_Fast(seq, "");
8738 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008739 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00008740 }
8741
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008742 /* NOTE: the following code can't call back into Python code,
8743 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00008744 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008745
Tim Peters05eba1f2004-08-27 21:32:02 +00008746 seqlen = PySequence_Fast_GET_SIZE(fseq);
8747 /* If empty sequence, return u"". */
8748 if (seqlen == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008749 res = PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008750 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00008751 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008752 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00008753 /* If singleton sequence with an exact Unicode, return that. */
8754 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008755 item = items[0];
8756 if (PyUnicode_CheckExact(item)) {
8757 Py_INCREF(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008758 res = item;
Benjamin Peterson29060642009-01-31 22:14:21 +00008759 goto Done;
8760 }
Tim Peters8ce9f162004-08-27 01:49:32 +00008761 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008762 else {
8763 /* Set up sep and seplen */
8764 if (separator == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008765 /* fall back to a blank space separator */
8766 sep = PyUnicode_FromOrdinal(' ');
Victor Stinnere9a29352011-10-01 02:14:59 +02008767 if (!sep)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008768 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00008769 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008770 else {
8771 if (!PyUnicode_Check(separator)) {
8772 PyErr_Format(PyExc_TypeError,
8773 "separator: expected str instance,"
8774 " %.80s found",
8775 Py_TYPE(separator)->tp_name);
8776 goto onError;
8777 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008778 if (PyUnicode_READY(separator) == -1)
8779 goto onError;
8780 sep = separator;
8781 seplen = PyUnicode_GET_LENGTH(separator);
8782 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
8783 /* inc refcount to keep this code path symetric with the
8784 above case of a blank separator */
8785 Py_INCREF(sep);
Tim Peters05eba1f2004-08-27 21:32:02 +00008786 }
8787 }
8788
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008789 /* There are at least two things to join, or else we have a subclass
8790 * of str in the sequence.
8791 * Do a pre-pass to figure out the total amount of space we'll
8792 * need (sz), and see whether all argument are strings.
8793 */
8794 sz = 0;
8795 for (i = 0; i < seqlen; i++) {
8796 const Py_ssize_t old_sz = sz;
8797 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00008798 if (!PyUnicode_Check(item)) {
8799 PyErr_Format(PyExc_TypeError,
8800 "sequence item %zd: expected str instance,"
8801 " %.80s found",
8802 i, Py_TYPE(item)->tp_name);
8803 goto onError;
8804 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008805 if (PyUnicode_READY(item) == -1)
8806 goto onError;
8807 sz += PyUnicode_GET_LENGTH(item);
8808 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
8809 if (item_maxchar > maxchar)
8810 maxchar = item_maxchar;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008811 if (i != 0)
8812 sz += seplen;
8813 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
8814 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008815 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008816 goto onError;
8817 }
8818 }
Tim Petersced69f82003-09-16 20:30:58 +00008819
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008820 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008821 if (res == NULL)
8822 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00008823
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008824 /* Catenate everything. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008825 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008826 Py_ssize_t itemlen;
8827 item = items[i];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008828 itemlen = PyUnicode_GET_LENGTH(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008829 /* Copy item, and maybe the separator. */
8830 if (i) {
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008831 if (PyUnicode_CopyCharacters(res, res_offset,
8832 sep, 0, seplen) < 0)
8833 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008834 res_offset += seplen;
Benjamin Peterson29060642009-01-31 22:14:21 +00008835 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008836 if (PyUnicode_CopyCharacters(res, res_offset,
8837 item, 0, itemlen) < 0)
8838 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008839 res_offset += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00008840 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008841 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00008842
Benjamin Peterson29060642009-01-31 22:14:21 +00008843 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00008844 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008845 Py_XDECREF(sep);
8846 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008847
Benjamin Peterson29060642009-01-31 22:14:21 +00008848 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00008849 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008850 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00008851 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008852 return NULL;
8853}
8854
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008855#define FILL(kind, data, value, start, length) \
8856 do { \
8857 Py_ssize_t i_ = 0; \
8858 assert(kind != PyUnicode_WCHAR_KIND); \
8859 switch ((kind)) { \
8860 case PyUnicode_1BYTE_KIND: { \
8861 unsigned char * to_ = (unsigned char *)((data)) + (start); \
8862 memset(to_, (unsigned char)value, length); \
8863 break; \
8864 } \
8865 case PyUnicode_2BYTE_KIND: { \
8866 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
8867 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
8868 break; \
8869 } \
8870 default: { \
8871 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
8872 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
8873 break; \
8874 } \
8875 } \
8876 } while (0)
8877
Alexander Belopolsky40018472011-02-26 01:02:56 +00008878static PyUnicodeObject *
8879pad(PyUnicodeObject *self,
8880 Py_ssize_t left,
8881 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008882 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008883{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008884 PyObject *u;
8885 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008886 int kind;
8887 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008888
8889 if (left < 0)
8890 left = 0;
8891 if (right < 0)
8892 right = 0;
8893
Tim Peters7a29bd52001-09-12 03:03:31 +00008894 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008895 Py_INCREF(self);
8896 return self;
8897 }
8898
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008899 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
8900 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00008901 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
8902 return NULL;
8903 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008904 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
8905 if (fill > maxchar)
8906 maxchar = fill;
8907 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008908 if (!u)
8909 return NULL;
8910
8911 kind = PyUnicode_KIND(u);
8912 data = PyUnicode_DATA(u);
8913 if (left)
8914 FILL(kind, data, fill, 0, left);
8915 if (right)
8916 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinner157f83f2011-09-28 21:41:31 +02008917 if (PyUnicode_CopyCharacters(u, left,
8918 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008919 _PyUnicode_LENGTH(self)) < 0)
8920 {
8921 Py_DECREF(u);
8922 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008923 }
8924
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008925 return (PyUnicodeObject*)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008926}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008927#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00008928
Alexander Belopolsky40018472011-02-26 01:02:56 +00008929PyObject *
8930PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008931{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008932 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008933
8934 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008935 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008936 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008937
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008938 switch(PyUnicode_KIND(string)) {
8939 case PyUnicode_1BYTE_KIND:
8940 list = ucs1lib_splitlines(
8941 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
8942 PyUnicode_GET_LENGTH(string), keepends);
8943 break;
8944 case PyUnicode_2BYTE_KIND:
8945 list = ucs2lib_splitlines(
8946 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
8947 PyUnicode_GET_LENGTH(string), keepends);
8948 break;
8949 case PyUnicode_4BYTE_KIND:
8950 list = ucs4lib_splitlines(
8951 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
8952 PyUnicode_GET_LENGTH(string), keepends);
8953 break;
8954 default:
8955 assert(0);
8956 list = 0;
8957 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008958 Py_DECREF(string);
8959 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008960}
8961
Alexander Belopolsky40018472011-02-26 01:02:56 +00008962static PyObject *
8963split(PyUnicodeObject *self,
8964 PyUnicodeObject *substring,
8965 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008966{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008967 int kind1, kind2, kind;
8968 void *buf1, *buf2;
8969 Py_ssize_t len1, len2;
8970 PyObject* out;
8971
Guido van Rossumd57fd912000-03-10 22:53:23 +00008972 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008973 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008974
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008975 if (PyUnicode_READY(self) == -1)
8976 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008977
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008978 if (substring == NULL)
8979 switch(PyUnicode_KIND(self)) {
8980 case PyUnicode_1BYTE_KIND:
8981 return ucs1lib_split_whitespace(
8982 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
8983 PyUnicode_GET_LENGTH(self), maxcount
8984 );
8985 case PyUnicode_2BYTE_KIND:
8986 return ucs2lib_split_whitespace(
8987 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
8988 PyUnicode_GET_LENGTH(self), maxcount
8989 );
8990 case PyUnicode_4BYTE_KIND:
8991 return ucs4lib_split_whitespace(
8992 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
8993 PyUnicode_GET_LENGTH(self), maxcount
8994 );
8995 default:
8996 assert(0);
8997 return NULL;
8998 }
8999
9000 if (PyUnicode_READY(substring) == -1)
9001 return NULL;
9002
9003 kind1 = PyUnicode_KIND(self);
9004 kind2 = PyUnicode_KIND(substring);
9005 kind = kind1 > kind2 ? kind1 : kind2;
9006 buf1 = PyUnicode_DATA(self);
9007 buf2 = PyUnicode_DATA(substring);
9008 if (kind1 != kind)
9009 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9010 if (!buf1)
9011 return NULL;
9012 if (kind2 != kind)
9013 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9014 if (!buf2) {
9015 if (kind1 != kind) PyMem_Free(buf1);
9016 return NULL;
9017 }
9018 len1 = PyUnicode_GET_LENGTH(self);
9019 len2 = PyUnicode_GET_LENGTH(substring);
9020
9021 switch(kind) {
9022 case PyUnicode_1BYTE_KIND:
9023 out = ucs1lib_split(
9024 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9025 break;
9026 case PyUnicode_2BYTE_KIND:
9027 out = ucs2lib_split(
9028 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9029 break;
9030 case PyUnicode_4BYTE_KIND:
9031 out = ucs4lib_split(
9032 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9033 break;
9034 default:
9035 out = NULL;
9036 }
9037 if (kind1 != kind)
9038 PyMem_Free(buf1);
9039 if (kind2 != kind)
9040 PyMem_Free(buf2);
9041 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009042}
9043
Alexander Belopolsky40018472011-02-26 01:02:56 +00009044static PyObject *
9045rsplit(PyUnicodeObject *self,
9046 PyUnicodeObject *substring,
9047 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009048{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009049 int kind1, kind2, kind;
9050 void *buf1, *buf2;
9051 Py_ssize_t len1, len2;
9052 PyObject* out;
9053
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009054 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009055 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009056
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009057 if (PyUnicode_READY(self) == -1)
9058 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009059
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009060 if (substring == NULL)
9061 switch(PyUnicode_KIND(self)) {
9062 case PyUnicode_1BYTE_KIND:
9063 return ucs1lib_rsplit_whitespace(
9064 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9065 PyUnicode_GET_LENGTH(self), maxcount
9066 );
9067 case PyUnicode_2BYTE_KIND:
9068 return ucs2lib_rsplit_whitespace(
9069 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9070 PyUnicode_GET_LENGTH(self), maxcount
9071 );
9072 case PyUnicode_4BYTE_KIND:
9073 return ucs4lib_rsplit_whitespace(
9074 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9075 PyUnicode_GET_LENGTH(self), maxcount
9076 );
9077 default:
9078 assert(0);
9079 return NULL;
9080 }
9081
9082 if (PyUnicode_READY(substring) == -1)
9083 return NULL;
9084
9085 kind1 = PyUnicode_KIND(self);
9086 kind2 = PyUnicode_KIND(substring);
9087 kind = kind1 > kind2 ? kind1 : kind2;
9088 buf1 = PyUnicode_DATA(self);
9089 buf2 = PyUnicode_DATA(substring);
9090 if (kind1 != kind)
9091 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9092 if (!buf1)
9093 return NULL;
9094 if (kind2 != kind)
9095 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9096 if (!buf2) {
9097 if (kind1 != kind) PyMem_Free(buf1);
9098 return NULL;
9099 }
9100 len1 = PyUnicode_GET_LENGTH(self);
9101 len2 = PyUnicode_GET_LENGTH(substring);
9102
9103 switch(kind) {
9104 case PyUnicode_1BYTE_KIND:
9105 out = ucs1lib_rsplit(
9106 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9107 break;
9108 case PyUnicode_2BYTE_KIND:
9109 out = ucs2lib_rsplit(
9110 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9111 break;
9112 case PyUnicode_4BYTE_KIND:
9113 out = ucs4lib_rsplit(
9114 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9115 break;
9116 default:
9117 out = NULL;
9118 }
9119 if (kind1 != kind)
9120 PyMem_Free(buf1);
9121 if (kind2 != kind)
9122 PyMem_Free(buf2);
9123 return out;
9124}
9125
9126static Py_ssize_t
9127anylib_find(int kind, void *buf1, Py_ssize_t len1,
9128 void *buf2, Py_ssize_t len2, Py_ssize_t offset)
9129{
9130 switch(kind) {
9131 case PyUnicode_1BYTE_KIND:
9132 return ucs1lib_find(buf1, len1, buf2, len2, offset);
9133 case PyUnicode_2BYTE_KIND:
9134 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9135 case PyUnicode_4BYTE_KIND:
9136 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9137 }
9138 assert(0);
9139 return -1;
9140}
9141
9142static Py_ssize_t
9143anylib_count(int kind, void* sbuf, Py_ssize_t slen,
9144 void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
9145{
9146 switch(kind) {
9147 case PyUnicode_1BYTE_KIND:
9148 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
9149 case PyUnicode_2BYTE_KIND:
9150 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9151 case PyUnicode_4BYTE_KIND:
9152 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9153 }
9154 assert(0);
9155 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009156}
9157
Alexander Belopolsky40018472011-02-26 01:02:56 +00009158static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009159replace(PyObject *self, PyObject *str1,
9160 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009161{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009162 PyObject *u;
9163 char *sbuf = PyUnicode_DATA(self);
9164 char *buf1 = PyUnicode_DATA(str1);
9165 char *buf2 = PyUnicode_DATA(str2);
9166 int srelease = 0, release1 = 0, release2 = 0;
9167 int skind = PyUnicode_KIND(self);
9168 int kind1 = PyUnicode_KIND(str1);
9169 int kind2 = PyUnicode_KIND(str2);
9170 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9171 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9172 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009173
9174 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009175 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009176 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009177 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009178
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009179 if (skind < kind1)
9180 /* substring too wide to be present */
9181 goto nothing;
9182
9183 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00009184 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009185 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009186 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009187 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009188 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009189 /* replace characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009190 Py_UCS4 u1, u2, maxchar;
9191 int mayshrink, rkind;
9192 u1 = PyUnicode_READ_CHAR(str1, 0);
9193 if (!findchar(sbuf, PyUnicode_KIND(self),
9194 slen, u1, 1))
Thomas Wouters477c8d52006-05-27 19:21:47 +00009195 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009196 u2 = PyUnicode_READ_CHAR(str2, 0);
9197 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9198 /* Replacing u1 with u2 may cause a maxchar reduction in the
9199 result string. */
9200 mayshrink = maxchar > 127;
9201 if (u2 > maxchar) {
9202 maxchar = u2;
9203 mayshrink = 0;
9204 }
9205 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009206 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009207 goto error;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009208 if (PyUnicode_CopyCharacters(u, 0,
9209 (PyObject*)self, 0, slen) < 0)
9210 {
9211 Py_DECREF(u);
9212 return NULL;
9213 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009214 rkind = PyUnicode_KIND(u);
9215 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
9216 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009217 if (--maxcount < 0)
9218 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009219 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009220 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009221 if (mayshrink) {
9222 PyObject *tmp = u;
9223 u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp),
9224 PyUnicode_GET_LENGTH(tmp));
9225 Py_DECREF(tmp);
9226 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009227 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009228 int rkind = skind;
9229 char *res;
9230 if (kind1 < rkind) {
9231 /* widen substring */
9232 buf1 = _PyUnicode_AsKind(str1, rkind);
9233 if (!buf1) goto error;
9234 release1 = 1;
9235 }
9236 i = anylib_find(rkind, sbuf, slen, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009237 if (i < 0)
9238 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009239 if (rkind > kind2) {
9240 /* widen replacement */
9241 buf2 = _PyUnicode_AsKind(str2, rkind);
9242 if (!buf2) goto error;
9243 release2 = 1;
9244 }
9245 else if (rkind < kind2) {
9246 /* widen self and buf1 */
9247 rkind = kind2;
9248 if (release1) PyMem_Free(buf1);
9249 sbuf = _PyUnicode_AsKind(self, rkind);
9250 if (!sbuf) goto error;
9251 srelease = 1;
9252 buf1 = _PyUnicode_AsKind(str1, rkind);
9253 if (!buf1) goto error;
9254 release1 = 1;
9255 }
9256 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen));
9257 if (!res) {
9258 PyErr_NoMemory();
9259 goto error;
9260 }
9261 memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen));
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009262 /* change everything in-place, starting with this one */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009263 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9264 buf2,
9265 PyUnicode_KIND_SIZE(rkind, len2));
9266 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009267
9268 while ( --maxcount > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009269 i = anylib_find(rkind, sbuf+PyUnicode_KIND_SIZE(rkind, i),
9270 slen-i,
9271 buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009272 if (i == -1)
9273 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009274 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9275 buf2,
9276 PyUnicode_KIND_SIZE(rkind, len2));
9277 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009278 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009279
9280 u = PyUnicode_FromKindAndData(rkind, res, slen);
9281 PyMem_Free(res);
9282 if (!u) goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009283 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009284 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009285
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009286 Py_ssize_t n, i, j, ires;
9287 Py_ssize_t product, new_size;
9288 int rkind = skind;
9289 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009290
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009291 if (kind1 < rkind) {
9292 buf1 = _PyUnicode_AsKind(str1, rkind);
9293 if (!buf1) goto error;
9294 release1 = 1;
9295 }
9296 n = anylib_count(rkind, sbuf, slen, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009297 if (n == 0)
9298 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009299 if (kind2 < rkind) {
9300 buf2 = _PyUnicode_AsKind(str2, rkind);
9301 if (!buf2) goto error;
9302 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009303 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009304 else if (kind2 > rkind) {
9305 rkind = kind2;
9306 sbuf = _PyUnicode_AsKind(self, rkind);
9307 if (!sbuf) goto error;
9308 srelease = 1;
9309 if (release1) PyMem_Free(buf1);
9310 buf1 = _PyUnicode_AsKind(str1, rkind);
9311 if (!buf1) goto error;
9312 release1 = 1;
9313 }
9314 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
9315 PyUnicode_GET_LENGTH(str1))); */
9316 product = n * (len2-len1);
9317 if ((product / (len2-len1)) != n) {
9318 PyErr_SetString(PyExc_OverflowError,
9319 "replace string is too long");
9320 goto error;
9321 }
9322 new_size = slen + product;
9323 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
9324 PyErr_SetString(PyExc_OverflowError,
9325 "replace string is too long");
9326 goto error;
9327 }
9328 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size));
9329 if (!res)
9330 goto error;
9331 ires = i = 0;
9332 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009333 while (n-- > 0) {
9334 /* look for next match */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009335 j = anylib_find(rkind,
9336 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9337 slen-i, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009338 if (j == -1)
9339 break;
9340 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009341 /* copy unchanged part [i:j] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009342 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9343 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9344 PyUnicode_KIND_SIZE(rkind, j-i));
9345 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009346 }
9347 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009348 if (len2 > 0) {
9349 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9350 buf2,
9351 PyUnicode_KIND_SIZE(rkind, len2));
9352 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009353 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009354 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009355 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009356 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009357 /* copy tail [i:] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009358 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9359 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9360 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009361 } else {
9362 /* interleave */
9363 while (n > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009364 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9365 buf2,
9366 PyUnicode_KIND_SIZE(rkind, len2));
9367 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009368 if (--n <= 0)
9369 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009370 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9371 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9372 PyUnicode_KIND_SIZE(rkind, 1));
9373 ires++;
9374 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009375 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009376 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9377 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9378 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009379 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009380 u = PyUnicode_FromKindAndData(rkind, res, new_size);
Martin v. Löwis0b1d3482011-10-01 16:35:40 +02009381 PyMem_Free(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009382 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009383 if (srelease)
9384 PyMem_FREE(sbuf);
9385 if (release1)
9386 PyMem_FREE(buf1);
9387 if (release2)
9388 PyMem_FREE(buf2);
9389 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009390
Benjamin Peterson29060642009-01-31 22:14:21 +00009391 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00009392 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009393 if (srelease)
9394 PyMem_FREE(sbuf);
9395 if (release1)
9396 PyMem_FREE(buf1);
9397 if (release2)
9398 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009399 if (PyUnicode_CheckExact(self)) {
9400 Py_INCREF(self);
9401 return (PyObject *) self;
9402 }
Victor Stinner034f6cf2011-09-30 02:26:44 +02009403 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009404 error:
9405 if (srelease && sbuf)
9406 PyMem_FREE(sbuf);
9407 if (release1 && buf1)
9408 PyMem_FREE(buf1);
9409 if (release2 && buf2)
9410 PyMem_FREE(buf2);
9411 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009412}
9413
9414/* --- Unicode Object Methods --------------------------------------------- */
9415
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009416PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009417 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009418\n\
9419Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009420characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009421
9422static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009423unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009424{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009425 return fixup(self, fixtitle);
9426}
9427
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009428PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009429 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009430\n\
9431Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00009432have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009433
9434static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009435unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009436{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009437 return fixup(self, fixcapitalize);
9438}
9439
9440#if 0
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009441PyDoc_STRVAR(capwords__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009442 "S.capwords() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009443\n\
9444Apply .capitalize() to all words in S and return the result with\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009445normalized whitespace (all whitespace strings are replaced by ' ').");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009446
9447static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009448unicode_capwords(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009449{
9450 PyObject *list;
9451 PyObject *item;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009452 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009453
Guido van Rossumd57fd912000-03-10 22:53:23 +00009454 /* Split into words */
9455 list = split(self, NULL, -1);
9456 if (!list)
9457 return NULL;
9458
9459 /* Capitalize each word */
9460 for (i = 0; i < PyList_GET_SIZE(list); i++) {
9461 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
Benjamin Peterson29060642009-01-31 22:14:21 +00009462 fixcapitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009463 if (item == NULL)
9464 goto onError;
9465 Py_DECREF(PyList_GET_ITEM(list, i));
9466 PyList_SET_ITEM(list, i, item);
9467 }
9468
9469 /* Join the words to form a new string */
9470 item = PyUnicode_Join(NULL, list);
9471
Benjamin Peterson29060642009-01-31 22:14:21 +00009472 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009473 Py_DECREF(list);
9474 return (PyObject *)item;
9475}
9476#endif
9477
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009478/* Argument converter. Coerces to a single unicode character */
9479
9480static int
9481convert_uc(PyObject *obj, void *addr)
9482{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009483 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009484 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009485
Benjamin Peterson14339b62009-01-31 16:36:08 +00009486 uniobj = PyUnicode_FromObject(obj);
9487 if (uniobj == NULL) {
9488 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009489 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009490 return 0;
9491 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009492 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009493 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009494 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009495 Py_DECREF(uniobj);
9496 return 0;
9497 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009498 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009499 Py_DECREF(uniobj);
9500 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009501}
9502
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009503PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009504 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009505\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009506Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009507done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009508
9509static PyObject *
9510unicode_center(PyUnicodeObject *self, PyObject *args)
9511{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009512 Py_ssize_t marg, left;
9513 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009514 Py_UCS4 fillchar = ' ';
9515
Victor Stinnere9a29352011-10-01 02:14:59 +02009516 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009517 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009518
Victor Stinnere9a29352011-10-01 02:14:59 +02009519 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009520 return NULL;
9521
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009522 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009523 Py_INCREF(self);
9524 return (PyObject*) self;
9525 }
9526
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009527 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009528 left = marg / 2 + (marg & width & 1);
9529
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009530 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009531}
9532
Marc-André Lemburge5034372000-08-08 08:04:29 +00009533#if 0
9534
9535/* This code should go into some future Unicode collation support
9536 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00009537 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00009538
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009539/* speedy UTF-16 code point order comparison */
9540/* gleaned from: */
9541/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
9542
/* Adjustment table indexed by the top five bits (c >> 11) of a 16-bit code
   unit; the adjustment is added to the unit before comparison.  Only the
   last five slots are non-zero. */
static short utf16Fixup[32] =
{
    /*  0 ..  7 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /*  8 .. 15 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 16 .. 23 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 24 .. 26 */ 0, 0, 0,
    /* 27       */ 0x2000,
    /* 28 .. 31 */ -0x800, -0x800, -0x800, -0x800
};
9550
Guido van Rossumd57fd912000-03-10 22:53:23 +00009551static int
9552unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9553{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009554 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009555
Guido van Rossumd57fd912000-03-10 22:53:23 +00009556 Py_UNICODE *s1 = str1->str;
9557 Py_UNICODE *s2 = str2->str;
9558
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009559 len1 = str1->_base._base.length;
9560 len2 = str2->_base._base.length;
Tim Petersced69f82003-09-16 20:30:58 +00009561
Guido van Rossumd57fd912000-03-10 22:53:23 +00009562 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00009563 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009564
9565 c1 = *s1++;
9566 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00009567
Benjamin Peterson29060642009-01-31 22:14:21 +00009568 if (c1 > (1<<11) * 26)
9569 c1 += utf16Fixup[c1>>11];
9570 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009571 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009572 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00009573
9574 if (c1 != c2)
9575 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00009576
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009577 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009578 }
9579
9580 return (len1 < len2) ? -1 : (len1 != len2);
9581}
9582
Marc-André Lemburge5034372000-08-08 08:04:29 +00009583#else
9584
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009585/* This function assumes that str1 and str2 are readied by the caller. */
9586
Marc-André Lemburge5034372000-08-08 08:04:29 +00009587static int
9588unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9589{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009590 int kind1, kind2;
9591 void *data1, *data2;
9592 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009593
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009594 kind1 = PyUnicode_KIND(str1);
9595 kind2 = PyUnicode_KIND(str2);
9596 data1 = PyUnicode_DATA(str1);
9597 data2 = PyUnicode_DATA(str2);
9598 len1 = PyUnicode_GET_LENGTH(str1);
9599 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +00009600
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009601 for (i = 0; i < len1 && i < len2; ++i) {
9602 Py_UCS4 c1, c2;
9603 c1 = PyUnicode_READ(kind1, data1, i);
9604 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +00009605
9606 if (c1 != c2)
9607 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009608 }
9609
9610 return (len1 < len2) ? -1 : (len1 != len2);
9611}
9612
9613#endif
9614
Alexander Belopolsky40018472011-02-26 01:02:56 +00009615int
9616PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009617{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009618 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9619 if (PyUnicode_READY(left) == -1 ||
9620 PyUnicode_READY(right) == -1)
9621 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009622 return unicode_compare((PyUnicodeObject *)left,
9623 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009624 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009625 PyErr_Format(PyExc_TypeError,
9626 "Can't compare %.100s and %.100s",
9627 left->ob_type->tp_name,
9628 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009629 return -1;
9630}
9631
Martin v. Löwis5b222132007-06-10 09:51:05 +00009632int
9633PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
9634{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009635 Py_ssize_t i;
9636 int kind;
9637 void *data;
9638 Py_UCS4 chr;
9639
Victor Stinner910337b2011-10-03 03:20:16 +02009640 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009641 if (PyUnicode_READY(uni) == -1)
9642 return -1;
9643 kind = PyUnicode_KIND(uni);
9644 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009645 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009646 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
9647 if (chr != str[i])
9648 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00009649 /* This check keeps Python strings that end in '\0' from comparing equal
9650 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009651 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +00009652 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009653 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00009654 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009655 return 0;
9656}
9657
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009658
Benjamin Peterson29060642009-01-31 22:14:21 +00009659#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00009660 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009661
Alexander Belopolsky40018472011-02-26 01:02:56 +00009662PyObject *
9663PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009664{
9665 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009666
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009667 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9668 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009669 if (PyUnicode_READY(left) == -1 ||
9670 PyUnicode_READY(right) == -1)
9671 return NULL;
9672 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
9673 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009674 if (op == Py_EQ) {
9675 Py_INCREF(Py_False);
9676 return Py_False;
9677 }
9678 if (op == Py_NE) {
9679 Py_INCREF(Py_True);
9680 return Py_True;
9681 }
9682 }
9683 if (left == right)
9684 result = 0;
9685 else
9686 result = unicode_compare((PyUnicodeObject *)left,
9687 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009688
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009689 /* Convert the return value to a Boolean */
9690 switch (op) {
9691 case Py_EQ:
9692 v = TEST_COND(result == 0);
9693 break;
9694 case Py_NE:
9695 v = TEST_COND(result != 0);
9696 break;
9697 case Py_LE:
9698 v = TEST_COND(result <= 0);
9699 break;
9700 case Py_GE:
9701 v = TEST_COND(result >= 0);
9702 break;
9703 case Py_LT:
9704 v = TEST_COND(result == -1);
9705 break;
9706 case Py_GT:
9707 v = TEST_COND(result == 1);
9708 break;
9709 default:
9710 PyErr_BadArgument();
9711 return NULL;
9712 }
9713 Py_INCREF(v);
9714 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009715 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009716
Brian Curtindfc80e32011-08-10 20:28:54 -05009717 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009718}
9719
Alexander Belopolsky40018472011-02-26 01:02:56 +00009720int
9721PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00009722{
Thomas Wouters477c8d52006-05-27 19:21:47 +00009723 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009724 int kind1, kind2, kind;
9725 void *buf1, *buf2;
9726 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009727 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009728
9729 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00009730 sub = PyUnicode_FromObject(element);
9731 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009732 PyErr_Format(PyExc_TypeError,
9733 "'in <string>' requires string as left operand, not %s",
9734 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009735 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009736 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009737 if (PyUnicode_READY(sub) == -1)
9738 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009739
Thomas Wouters477c8d52006-05-27 19:21:47 +00009740 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +02009741 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009742 Py_DECREF(sub);
9743 return -1;
9744 }
9745
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009746 kind1 = PyUnicode_KIND(str);
9747 kind2 = PyUnicode_KIND(sub);
9748 kind = kind1 > kind2 ? kind1 : kind2;
9749 buf1 = PyUnicode_DATA(str);
9750 buf2 = PyUnicode_DATA(sub);
9751 if (kind1 != kind)
9752 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
9753 if (!buf1) {
9754 Py_DECREF(sub);
9755 return -1;
9756 }
9757 if (kind2 != kind)
9758 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
9759 if (!buf2) {
9760 Py_DECREF(sub);
9761 if (kind1 != kind) PyMem_Free(buf1);
9762 return -1;
9763 }
9764 len1 = PyUnicode_GET_LENGTH(str);
9765 len2 = PyUnicode_GET_LENGTH(sub);
9766
9767 switch(kind) {
9768 case PyUnicode_1BYTE_KIND:
9769 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
9770 break;
9771 case PyUnicode_2BYTE_KIND:
9772 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
9773 break;
9774 case PyUnicode_4BYTE_KIND:
9775 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
9776 break;
9777 default:
9778 result = -1;
9779 assert(0);
9780 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009781
9782 Py_DECREF(str);
9783 Py_DECREF(sub);
9784
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009785 if (kind1 != kind)
9786 PyMem_Free(buf1);
9787 if (kind2 != kind)
9788 PyMem_Free(buf2);
9789
Guido van Rossum403d68b2000-03-13 15:55:09 +00009790 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009791}
9792
Guido van Rossumd57fd912000-03-10 22:53:23 +00009793/* Concat to string or Unicode object giving a new Unicode object. */
9794
Alexander Belopolsky40018472011-02-26 01:02:56 +00009795PyObject *
9796PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009797{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009798 PyObject *u = NULL, *v = NULL, *w;
9799 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009800
9801 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009802 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009803 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009804 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009805 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009806 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009807 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009808
9809 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +02009810 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009811 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009812 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009813 }
Victor Stinnera464fc12011-10-02 20:39:30 +02009814 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009815 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009816 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009817 }
9818
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009819 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinnerff9e50f2011-09-28 22:17:19 +02009820 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009821
Guido van Rossumd57fd912000-03-10 22:53:23 +00009822 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009823 w = PyUnicode_New(
9824 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
9825 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009826 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009827 goto onError;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009828 if (PyUnicode_CopyCharacters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)) < 0)
9829 goto onError;
Victor Stinner157f83f2011-09-28 21:41:31 +02009830 if (PyUnicode_CopyCharacters(w, PyUnicode_GET_LENGTH(u),
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009831 v, 0,
9832 PyUnicode_GET_LENGTH(v)) < 0)
9833 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009834 Py_DECREF(u);
9835 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009836 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009837
Benjamin Peterson29060642009-01-31 22:14:21 +00009838 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009839 Py_XDECREF(u);
9840 Py_XDECREF(v);
9841 return NULL;
9842}
9843
Walter Dörwald1ab83302007-05-18 17:15:44 +00009844void
Victor Stinner23e56682011-10-03 03:54:37 +02009845PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +00009846{
Victor Stinner23e56682011-10-03 03:54:37 +02009847 PyObject *left, *res;
9848
9849 if (p_left == NULL) {
9850 if (!PyErr_Occurred())
9851 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +00009852 return;
9853 }
Victor Stinner23e56682011-10-03 03:54:37 +02009854 left = *p_left;
9855 if (right == NULL || !PyUnicode_Check(left)) {
9856 if (!PyErr_Occurred())
9857 PyErr_BadInternalCall();
9858 goto error;
9859 }
9860
9861 if (PyUnicode_CheckExact(left) && left != unicode_empty
9862 && PyUnicode_CheckExact(right) && right != unicode_empty
9863 && unicode_resizable(left)
9864 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
9865 || _PyUnicode_WSTR(left) != NULL))
9866 {
9867 Py_ssize_t u_len, v_len, new_len, copied;
9868
9869 /* FIXME: don't make wstr string ready */
9870 if (PyUnicode_READY(left))
9871 goto error;
9872 if (PyUnicode_READY(right))
9873 goto error;
9874
9875 /* FIXME: support ascii+latin1, PyASCIIObject => PyCompactUnicodeObject */
9876 if (PyUnicode_MAX_CHAR_VALUE(right) <= PyUnicode_MAX_CHAR_VALUE(left))
9877 {
9878 u_len = PyUnicode_GET_LENGTH(left);
9879 v_len = PyUnicode_GET_LENGTH(right);
9880 if (u_len > PY_SSIZE_T_MAX - v_len) {
9881 PyErr_SetString(PyExc_OverflowError,
9882 "strings are too large to concat");
9883 goto error;
9884 }
9885 new_len = u_len + v_len;
9886
9887 /* Now we own the last reference to 'left', so we can resize it
9888 * in-place.
9889 */
9890 if (unicode_resize(&left, new_len) != 0) {
9891 /* XXX if _PyUnicode_Resize() fails, 'left' has been
9892 * deallocated so it cannot be put back into
9893 * 'variable'. The MemoryError is raised when there
9894 * is no value in 'variable', which might (very
9895 * remotely) be a cause of incompatibilities.
9896 */
9897 goto error;
9898 }
9899 /* copy 'right' into the newly allocated area of 'left' */
9900 copied = PyUnicode_CopyCharacters(left, u_len,
9901 right, 0,
9902 v_len);
9903 assert(0 <= copied);
9904 *p_left = left;
9905 return;
9906 }
9907 }
9908
9909 res = PyUnicode_Concat(left, right);
9910 if (res == NULL)
9911 goto error;
9912 Py_DECREF(left);
9913 *p_left = res;
9914 return;
9915
9916error:
9917 Py_DECREF(*p_left);
9918 *p_left = NULL;
Walter Dörwald1ab83302007-05-18 17:15:44 +00009919}
9920
9921void
9922PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
9923{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009924 PyUnicode_Append(pleft, right);
9925 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00009926}
9927
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009928PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009929 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009930\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00009931Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009932string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009933interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009934
9935static PyObject *
9936unicode_count(PyUnicodeObject *self, PyObject *args)
9937{
9938 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009939 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009940 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009941 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009942 int kind1, kind2, kind;
9943 void *buf1, *buf2;
9944 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009945
Jesus Ceaac451502011-04-20 17:09:23 +02009946 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
9947 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +00009948 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00009949
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009950 kind1 = PyUnicode_KIND(self);
9951 kind2 = PyUnicode_KIND(substring);
9952 kind = kind1 > kind2 ? kind1 : kind2;
9953 buf1 = PyUnicode_DATA(self);
9954 buf2 = PyUnicode_DATA(substring);
9955 if (kind1 != kind)
9956 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9957 if (!buf1) {
9958 Py_DECREF(substring);
9959 return NULL;
9960 }
9961 if (kind2 != kind)
9962 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9963 if (!buf2) {
9964 Py_DECREF(substring);
9965 if (kind1 != kind) PyMem_Free(buf1);
9966 return NULL;
9967 }
9968 len1 = PyUnicode_GET_LENGTH(self);
9969 len2 = PyUnicode_GET_LENGTH(substring);
9970
9971 ADJUST_INDICES(start, end, len1);
9972 switch(kind) {
9973 case PyUnicode_1BYTE_KIND:
9974 iresult = ucs1lib_count(
9975 ((Py_UCS1*)buf1) + start, end - start,
9976 buf2, len2, PY_SSIZE_T_MAX
9977 );
9978 break;
9979 case PyUnicode_2BYTE_KIND:
9980 iresult = ucs2lib_count(
9981 ((Py_UCS2*)buf1) + start, end - start,
9982 buf2, len2, PY_SSIZE_T_MAX
9983 );
9984 break;
9985 case PyUnicode_4BYTE_KIND:
9986 iresult = ucs4lib_count(
9987 ((Py_UCS4*)buf1) + start, end - start,
9988 buf2, len2, PY_SSIZE_T_MAX
9989 );
9990 break;
9991 default:
9992 assert(0); iresult = 0;
9993 }
9994
9995 result = PyLong_FromSsize_t(iresult);
9996
9997 if (kind1 != kind)
9998 PyMem_Free(buf1);
9999 if (kind2 != kind)
10000 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010001
10002 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010003
Guido van Rossumd57fd912000-03-10 22:53:23 +000010004 return result;
10005}
10006
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010007PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010008 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010009\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010010Encode S using the codec registered for encoding. Default encoding\n\
10011is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010012handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010013a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10014'xmlcharrefreplace' as well as any other name registered with\n\
10015codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010016
10017static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +000010018unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010019{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010020 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010021 char *encoding = NULL;
10022 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010023
Benjamin Peterson308d6372009-09-18 21:42:35 +000010024 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10025 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010026 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +000010027 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010028}
10029
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010030PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010031 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010032\n\
10033Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010034If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010035
10036static PyObject*
10037unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
10038{
10039 Py_UNICODE *e;
10040 Py_UNICODE *p;
10041 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010042 Py_UNICODE *qe;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010043 Py_ssize_t i, j, incr, wstr_length;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010044 PyUnicodeObject *u;
10045 int tabsize = 8;
10046
10047 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010048 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010049
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010050 if (PyUnicode_AsUnicodeAndSize((PyObject *)self, &wstr_length) == NULL)
10051 return NULL;
10052
Thomas Wouters7e474022000-07-16 12:04:32 +000010053 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010054 i = 0; /* chars up to and including most recent \n or \r */
10055 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010056 e = _PyUnicode_WSTR(self) + wstr_length; /* end of input */
10057 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010058 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010059 if (tabsize > 0) {
10060 incr = tabsize - (j % tabsize); /* cannot overflow */
10061 if (j > PY_SSIZE_T_MAX - incr)
10062 goto overflow1;
10063 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010064 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010065 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010066 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010067 if (j > PY_SSIZE_T_MAX - 1)
10068 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010069 j++;
10070 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010071 if (i > PY_SSIZE_T_MAX - j)
10072 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010073 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010074 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010075 }
10076 }
10077
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010078 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +000010079 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010080
Guido van Rossumd57fd912000-03-10 22:53:23 +000010081 /* Second pass: create output string and fill it */
10082 u = _PyUnicode_New(i + j);
10083 if (!u)
10084 return NULL;
10085
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010086 j = 0; /* same as in first pass */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010087 q = _PyUnicode_WSTR(u); /* next output char */
10088 qe = _PyUnicode_WSTR(u) + PyUnicode_GET_SIZE(u); /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +000010089
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010090 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010091 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010092 if (tabsize > 0) {
10093 i = tabsize - (j % tabsize);
10094 j += i;
10095 while (i--) {
10096 if (q >= qe)
10097 goto overflow2;
10098 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010099 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010100 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010101 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010102 else {
10103 if (q >= qe)
10104 goto overflow2;
10105 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010106 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010107 if (*p == '\n' || *p == '\r')
10108 j = 0;
10109 }
10110
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010111 if (PyUnicode_READY(u) == -1) {
10112 Py_DECREF(u);
10113 return NULL;
10114 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010115 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010116
10117 overflow2:
10118 Py_DECREF(u);
10119 overflow1:
10120 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10121 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010122}
10123
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010124PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010125 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010126\n\
10127Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010128such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010129arguments start and end are interpreted as in slice notation.\n\
10130\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010131Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010132
10133static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010134unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010135{
Jesus Ceaac451502011-04-20 17:09:23 +020010136 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010137 Py_ssize_t start;
10138 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010139 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010140
Jesus Ceaac451502011-04-20 17:09:23 +020010141 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10142 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010143 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010144
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010145 if (PyUnicode_READY(self) == -1)
10146 return NULL;
10147 if (PyUnicode_READY(substring) == -1)
10148 return NULL;
10149
10150 result = any_find_slice(
10151 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
10152 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010153 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010154
10155 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010156
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010157 if (result == -2)
10158 return NULL;
10159
Christian Heimes217cfd12007-12-02 14:31:20 +000010160 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010161}
10162
10163static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010164unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010165{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010166 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
10167 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010168 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010169 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010170}
10171
Guido van Rossumc2504932007-09-18 19:42:40 +000010172/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010010173 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000010174static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +000010175unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010176{
Guido van Rossumc2504932007-09-18 19:42:40 +000010177 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010010178 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010179
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010180 if (_PyUnicode_HASH(self) != -1)
10181 return _PyUnicode_HASH(self);
10182 if (PyUnicode_READY(self) == -1)
10183 return -1;
10184 len = PyUnicode_GET_LENGTH(self);
10185
10186 /* The hash function as a macro, gets expanded three times below. */
10187#define HASH(P) \
10188 x = (Py_uhash_t)*P << 7; \
10189 while (--len >= 0) \
10190 x = (1000003*x) ^ (Py_uhash_t)*P++;
10191
10192 switch (PyUnicode_KIND(self)) {
10193 case PyUnicode_1BYTE_KIND: {
10194 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
10195 HASH(c);
10196 break;
10197 }
10198 case PyUnicode_2BYTE_KIND: {
10199 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
10200 HASH(s);
10201 break;
10202 }
10203 default: {
10204 Py_UCS4 *l;
10205 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
10206 "Impossible switch case in unicode_hash");
10207 l = PyUnicode_4BYTE_DATA(self);
10208 HASH(l);
10209 break;
10210 }
10211 }
10212 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
10213
Guido van Rossumc2504932007-09-18 19:42:40 +000010214 if (x == -1)
10215 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010216 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010217 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010218}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010219#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000010220
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010221PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010222 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010223\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010224Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010225
10226static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010227unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010228{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010229 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +020010230 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010231 Py_ssize_t start;
10232 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010233
Jesus Ceaac451502011-04-20 17:09:23 +020010234 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
10235 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010236 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010237
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010238 if (PyUnicode_READY(self) == -1)
10239 return NULL;
10240 if (PyUnicode_READY(substring) == -1)
10241 return NULL;
10242
10243 result = any_find_slice(
10244 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
10245 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010246 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010247
10248 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010249
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010250 if (result == -2)
10251 return NULL;
10252
Guido van Rossumd57fd912000-03-10 22:53:23 +000010253 if (result < 0) {
10254 PyErr_SetString(PyExc_ValueError, "substring not found");
10255 return NULL;
10256 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010257
Christian Heimes217cfd12007-12-02 14:31:20 +000010258 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010259}
10260
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010261PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010262 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010263\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010264Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010265at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010266
10267static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010268unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010269{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010270 Py_ssize_t i, length;
10271 int kind;
10272 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010273 int cased;
10274
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010275 if (PyUnicode_READY(self) == -1)
10276 return NULL;
10277 length = PyUnicode_GET_LENGTH(self);
10278 kind = PyUnicode_KIND(self);
10279 data = PyUnicode_DATA(self);
10280
Guido van Rossumd57fd912000-03-10 22:53:23 +000010281 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010282 if (length == 1)
10283 return PyBool_FromLong(
10284 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010285
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010286 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010287 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010288 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010289
Guido van Rossumd57fd912000-03-10 22:53:23 +000010290 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010291 for (i = 0; i < length; i++) {
10292 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010293
Benjamin Peterson29060642009-01-31 22:14:21 +000010294 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
10295 return PyBool_FromLong(0);
10296 else if (!cased && Py_UNICODE_ISLOWER(ch))
10297 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010298 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010299 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010300}
10301
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010302PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010303 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010304\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010305Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010306at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010307
10308static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010309unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010310{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010311 Py_ssize_t i, length;
10312 int kind;
10313 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010314 int cased;
10315
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010316 if (PyUnicode_READY(self) == -1)
10317 return NULL;
10318 length = PyUnicode_GET_LENGTH(self);
10319 kind = PyUnicode_KIND(self);
10320 data = PyUnicode_DATA(self);
10321
Guido van Rossumd57fd912000-03-10 22:53:23 +000010322 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010323 if (length == 1)
10324 return PyBool_FromLong(
10325 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010326
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010327 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010328 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010329 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010330
Guido van Rossumd57fd912000-03-10 22:53:23 +000010331 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010332 for (i = 0; i < length; i++) {
10333 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010334
Benjamin Peterson29060642009-01-31 22:14:21 +000010335 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
10336 return PyBool_FromLong(0);
10337 else if (!cased && Py_UNICODE_ISUPPER(ch))
10338 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010339 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010340 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010341}
10342
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010343PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010344 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010345\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010346Return True if S is a titlecased string and there is at least one\n\
10347character in S, i.e. upper- and titlecase characters may only\n\
10348follow uncased characters and lowercase characters only cased ones.\n\
10349Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010350
10351static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010352unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010353{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010354 Py_ssize_t i, length;
10355 int kind;
10356 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010357 int cased, previous_is_cased;
10358
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010359 if (PyUnicode_READY(self) == -1)
10360 return NULL;
10361 length = PyUnicode_GET_LENGTH(self);
10362 kind = PyUnicode_KIND(self);
10363 data = PyUnicode_DATA(self);
10364
Guido van Rossumd57fd912000-03-10 22:53:23 +000010365 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010366 if (length == 1) {
10367 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10368 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
10369 (Py_UNICODE_ISUPPER(ch) != 0));
10370 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010371
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010372 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010373 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010374 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010375
Guido van Rossumd57fd912000-03-10 22:53:23 +000010376 cased = 0;
10377 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010378 for (i = 0; i < length; i++) {
10379 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010380
Benjamin Peterson29060642009-01-31 22:14:21 +000010381 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
10382 if (previous_is_cased)
10383 return PyBool_FromLong(0);
10384 previous_is_cased = 1;
10385 cased = 1;
10386 }
10387 else if (Py_UNICODE_ISLOWER(ch)) {
10388 if (!previous_is_cased)
10389 return PyBool_FromLong(0);
10390 previous_is_cased = 1;
10391 cased = 1;
10392 }
10393 else
10394 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010395 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010396 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010397}
10398
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010399PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010400 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010401\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010402Return True if all characters in S are whitespace\n\
10403and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010404
10405static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010406unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010407{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010408 Py_ssize_t i, length;
10409 int kind;
10410 void *data;
10411
10412 if (PyUnicode_READY(self) == -1)
10413 return NULL;
10414 length = PyUnicode_GET_LENGTH(self);
10415 kind = PyUnicode_KIND(self);
10416 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010417
Guido van Rossumd57fd912000-03-10 22:53:23 +000010418 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010419 if (length == 1)
10420 return PyBool_FromLong(
10421 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010422
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010423 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010424 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010425 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010426
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010427 for (i = 0; i < length; i++) {
10428 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010429 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010430 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010431 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010432 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010433}
10434
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010435PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010436 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010437\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010438Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010439and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010440
10441static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010442unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010443{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010444 Py_ssize_t i, length;
10445 int kind;
10446 void *data;
10447
10448 if (PyUnicode_READY(self) == -1)
10449 return NULL;
10450 length = PyUnicode_GET_LENGTH(self);
10451 kind = PyUnicode_KIND(self);
10452 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010453
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010454 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010455 if (length == 1)
10456 return PyBool_FromLong(
10457 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010458
10459 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010460 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010461 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010462
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010463 for (i = 0; i < length; i++) {
10464 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010465 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010466 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010467 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010468}
10469
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010470PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010471 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010472\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010473Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010474and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010475
10476static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010477unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010478{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010479 int kind;
10480 void *data;
10481 Py_ssize_t len, i;
10482
10483 if (PyUnicode_READY(self) == -1)
10484 return NULL;
10485
10486 kind = PyUnicode_KIND(self);
10487 data = PyUnicode_DATA(self);
10488 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010489
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010490 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010491 if (len == 1) {
10492 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10493 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10494 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010495
10496 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010497 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010498 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010499
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010500 for (i = 0; i < len; i++) {
10501 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010502 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010503 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010504 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010505 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010506}
10507
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010508PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010509 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010510\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010511Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010512False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010513
10514static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010515unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010516{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010517 Py_ssize_t i, length;
10518 int kind;
10519 void *data;
10520
10521 if (PyUnicode_READY(self) == -1)
10522 return NULL;
10523 length = PyUnicode_GET_LENGTH(self);
10524 kind = PyUnicode_KIND(self);
10525 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010526
Guido van Rossumd57fd912000-03-10 22:53:23 +000010527 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010528 if (length == 1)
10529 return PyBool_FromLong(
10530 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010531
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010532 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010533 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010534 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010535
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010536 for (i = 0; i < length; i++) {
10537 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010538 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010539 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010540 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010541}
10542
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010543PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010544 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010545\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010546Return True if all characters in S are digits\n\
10547and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010548
10549static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010550unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010551{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010552 Py_ssize_t i, length;
10553 int kind;
10554 void *data;
10555
10556 if (PyUnicode_READY(self) == -1)
10557 return NULL;
10558 length = PyUnicode_GET_LENGTH(self);
10559 kind = PyUnicode_KIND(self);
10560 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010561
Guido van Rossumd57fd912000-03-10 22:53:23 +000010562 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010563 if (length == 1) {
10564 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10565 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
10566 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010567
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010568 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010569 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010570 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010571
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010572 for (i = 0; i < length; i++) {
10573 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010574 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010575 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010576 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010577}
10578
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010579PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010580 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010581\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010582Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010583False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010584
10585static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010586unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010587{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010588 Py_ssize_t i, length;
10589 int kind;
10590 void *data;
10591
10592 if (PyUnicode_READY(self) == -1)
10593 return NULL;
10594 length = PyUnicode_GET_LENGTH(self);
10595 kind = PyUnicode_KIND(self);
10596 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010597
Guido van Rossumd57fd912000-03-10 22:53:23 +000010598 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010599 if (length == 1)
10600 return PyBool_FromLong(
10601 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010602
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010603 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010604 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010605 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010606
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010607 for (i = 0; i < length; i++) {
10608 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010609 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010610 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010611 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010612}
10613
Martin v. Löwis47383402007-08-15 07:32:56 +000010614int
10615PyUnicode_IsIdentifier(PyObject *self)
10616{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010617 int kind;
10618 void *data;
10619 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010620 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000010621
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010622 if (PyUnicode_READY(self) == -1) {
10623 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000010624 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010625 }
10626
10627 /* Special case for empty strings */
10628 if (PyUnicode_GET_LENGTH(self) == 0)
10629 return 0;
10630 kind = PyUnicode_KIND(self);
10631 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000010632
10633 /* PEP 3131 says that the first character must be in
10634 XID_Start and subsequent characters in XID_Continue,
10635 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000010636 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000010637 letters, digits, underscore). However, given the current
10638 definition of XID_Start and XID_Continue, it is sufficient
10639 to check just for these, except that _ must be allowed
10640 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010641 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050010642 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000010643 return 0;
10644
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040010645 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010646 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010647 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000010648 return 1;
10649}
10650
10651PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010652 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000010653\n\
10654Return True if S is a valid identifier according\n\
10655to the language definition.");
10656
10657static PyObject*
10658unicode_isidentifier(PyObject *self)
10659{
10660 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
10661}
10662
Georg Brandl559e5d72008-06-11 18:37:52 +000010663PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010664 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000010665\n\
10666Return True if all characters in S are considered\n\
10667printable in repr() or S is empty, False otherwise.");
10668
10669static PyObject*
10670unicode_isprintable(PyObject *self)
10671{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010672 Py_ssize_t i, length;
10673 int kind;
10674 void *data;
10675
10676 if (PyUnicode_READY(self) == -1)
10677 return NULL;
10678 length = PyUnicode_GET_LENGTH(self);
10679 kind = PyUnicode_KIND(self);
10680 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000010681
10682 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010683 if (length == 1)
10684 return PyBool_FromLong(
10685 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000010686
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010687 for (i = 0; i < length; i++) {
10688 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010689 Py_RETURN_FALSE;
10690 }
10691 }
10692 Py_RETURN_TRUE;
10693}
10694
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010695PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000010696 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010697\n\
10698Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000010699iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010700
10701static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010702unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010703{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010704 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010705}
10706
Martin v. Löwis18e16552006-02-15 17:27:45 +000010707static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000010708unicode_length(PyUnicodeObject *self)
10709{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010710 if (PyUnicode_READY(self) == -1)
10711 return -1;
10712 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010713}
10714
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010715PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010716 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010717\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010718Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010719done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010720
10721static PyObject *
10722unicode_ljust(PyUnicodeObject *self, PyObject *args)
10723{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010724 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010725 Py_UCS4 fillchar = ' ';
10726
10727 if (PyUnicode_READY(self) == -1)
10728 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010729
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010730 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010731 return NULL;
10732
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010733 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010734 Py_INCREF(self);
10735 return (PyObject*) self;
10736 }
10737
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010738 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010739}
10740
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010741PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010742 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010743\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010744Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010745
10746static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010747unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010748{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010749 return fixup(self, fixlower);
10750}
10751
/* Strip-mode selectors; they double as indexes into stripformat[] */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings, indexed by the strip-mode selectors above */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip mode i: the format string minus its 3-character
   "|O:" prefix */
#define STRIPNAME(i) (stripformat[i]+3)
10760
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010761/* externally visible for str.strip(unicode) */
10762PyObject *
10763_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
10764{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010765 void *data;
10766 int kind;
10767 Py_ssize_t i, j, len;
10768 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010769
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010770 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
10771 return NULL;
10772
10773 kind = PyUnicode_KIND(self);
10774 data = PyUnicode_DATA(self);
10775 len = PyUnicode_GET_LENGTH(self);
10776 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
10777 PyUnicode_DATA(sepobj),
10778 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010779
Benjamin Peterson14339b62009-01-31 16:36:08 +000010780 i = 0;
10781 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010782 while (i < len &&
10783 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010784 i++;
10785 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010786 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010787
Benjamin Peterson14339b62009-01-31 16:36:08 +000010788 j = len;
10789 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010790 do {
10791 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010792 } while (j >= i &&
10793 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000010794 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010795 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010796
Victor Stinner12bab6d2011-10-01 01:53:49 +020010797 return PyUnicode_Substring((PyObject*)self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010798}
10799
10800PyObject*
10801PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
10802{
10803 unsigned char *data;
10804 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020010805 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010806
Victor Stinnerde636f32011-10-01 03:55:54 +020010807 if (PyUnicode_READY(self) == -1)
10808 return NULL;
10809
10810 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
10811
Victor Stinner12bab6d2011-10-01 01:53:49 +020010812 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010813 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020010814 if (PyUnicode_CheckExact(self)) {
10815 Py_INCREF(self);
10816 return self;
10817 }
10818 else
10819 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010820 }
10821
Victor Stinner12bab6d2011-10-01 01:53:49 +020010822 length = end - start;
10823 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010824 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010825
Victor Stinnerde636f32011-10-01 03:55:54 +020010826 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020010827 PyErr_SetString(PyExc_IndexError, "string index out of range");
10828 return NULL;
10829 }
10830
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010831 kind = PyUnicode_KIND(self);
10832 data = PyUnicode_1BYTE_DATA(self);
Victor Stinner034f6cf2011-09-30 02:26:44 +020010833 return PyUnicode_FromKindAndData(kind,
10834 data + PyUnicode_KIND_SIZE(kind, start),
Victor Stinner12bab6d2011-10-01 01:53:49 +020010835 length);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010836}
Guido van Rossumd57fd912000-03-10 22:53:23 +000010837
10838static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010839do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010840{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010841 int kind;
10842 void *data;
10843 Py_ssize_t len, i, j;
10844
10845 if (PyUnicode_READY(self) == -1)
10846 return NULL;
10847
10848 kind = PyUnicode_KIND(self);
10849 data = PyUnicode_DATA(self);
10850 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010851
Benjamin Peterson14339b62009-01-31 16:36:08 +000010852 i = 0;
10853 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010854 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010855 i++;
10856 }
10857 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010858
Benjamin Peterson14339b62009-01-31 16:36:08 +000010859 j = len;
10860 if (striptype != LEFTSTRIP) {
10861 do {
10862 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010863 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010864 j++;
10865 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010866
Victor Stinner12bab6d2011-10-01 01:53:49 +020010867 return PyUnicode_Substring((PyObject*)self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010868}
10869
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010870
10871static PyObject *
10872do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
10873{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010874 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010875
Benjamin Peterson14339b62009-01-31 16:36:08 +000010876 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
10877 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010878
Benjamin Peterson14339b62009-01-31 16:36:08 +000010879 if (sep != NULL && sep != Py_None) {
10880 if (PyUnicode_Check(sep))
10881 return _PyUnicode_XStrip(self, striptype, sep);
10882 else {
10883 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010884 "%s arg must be None or str",
10885 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010886 return NULL;
10887 }
10888 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010889
Benjamin Peterson14339b62009-01-31 16:36:08 +000010890 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010891}
10892
10893
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010894PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010895 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010896\n\
10897Return a copy of the string S with leading and trailing\n\
10898whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010899If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010900
10901static PyObject *
10902unicode_strip(PyUnicodeObject *self, PyObject *args)
10903{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010904 if (PyTuple_GET_SIZE(args) == 0)
10905 return do_strip(self, BOTHSTRIP); /* Common case */
10906 else
10907 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010908}
10909
10910
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010911PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010912 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010913\n\
10914Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010915If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010916
10917static PyObject *
10918unicode_lstrip(PyUnicodeObject *self, PyObject *args)
10919{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010920 if (PyTuple_GET_SIZE(args) == 0)
10921 return do_strip(self, LEFTSTRIP); /* Common case */
10922 else
10923 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010924}
10925
10926
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010927PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010928 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010929\n\
10930Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010931If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010932
10933static PyObject *
10934unicode_rstrip(PyUnicodeObject *self, PyObject *args)
10935{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010936 if (PyTuple_GET_SIZE(args) == 0)
10937 return do_strip(self, RIGHTSTRIP); /* Common case */
10938 else
10939 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010940}
10941
10942
Guido van Rossumd57fd912000-03-10 22:53:23 +000010943static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000010944unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010945{
10946 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010947 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010948
Georg Brandl222de0f2009-04-12 12:01:50 +000010949 if (len < 1) {
10950 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020010951 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000010952 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010953
Tim Peters7a29bd52001-09-12 03:03:31 +000010954 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010955 /* no repeat, return original string */
10956 Py_INCREF(str);
10957 return (PyObject*) str;
10958 }
Tim Peters8f422462000-09-09 06:13:41 +000010959
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010960 if (PyUnicode_READY(str) == -1)
10961 return NULL;
10962
Victor Stinnerc759f3e2011-10-01 03:09:58 +020010963 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020010964 PyErr_SetString(PyExc_OverflowError,
10965 "repeated string is too long");
10966 return NULL;
10967 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010968 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020010969
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010970 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010971 if (!u)
10972 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020010973 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010974
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010975 if (PyUnicode_GET_LENGTH(str) == 1) {
10976 const int kind = PyUnicode_KIND(str);
10977 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
10978 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020010979 if (kind == PyUnicode_1BYTE_KIND)
10980 memset(to, (unsigned char)fill_char, len);
10981 else {
10982 for (n = 0; n < len; ++n)
10983 PyUnicode_WRITE(kind, to, n, fill_char);
10984 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010985 }
10986 else {
10987 /* number of characters copied this far */
10988 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
10989 const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str);
10990 char *to = (char *) PyUnicode_DATA(u);
10991 Py_MEMCPY(to, PyUnicode_DATA(str),
10992 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000010993 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010994 n = (done <= nchars-done) ? done : nchars-done;
10995 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010996 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000010997 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010998 }
10999
11000 return (PyObject*) u;
11001}
11002
Alexander Belopolsky40018472011-02-26 01:02:56 +000011003PyObject *
11004PyUnicode_Replace(PyObject *obj,
11005 PyObject *subobj,
11006 PyObject *replobj,
11007 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011008{
11009 PyObject *self;
11010 PyObject *str1;
11011 PyObject *str2;
11012 PyObject *result;
11013
11014 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011015 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011016 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011017 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011018 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011019 Py_DECREF(self);
11020 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011021 }
11022 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011023 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011024 Py_DECREF(self);
11025 Py_DECREF(str1);
11026 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011027 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011028 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011029 Py_DECREF(self);
11030 Py_DECREF(str1);
11031 Py_DECREF(str2);
11032 return result;
11033}
11034
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011035PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011036 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011037\n\
11038Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011039old replaced by new. If the optional argument count is\n\
11040given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011041
11042static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011043unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011044{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011045 PyObject *str1;
11046 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011047 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011048 PyObject *result;
11049
Martin v. Löwis18e16552006-02-15 17:27:45 +000011050 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011051 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011052 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011053 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011054 str1 = PyUnicode_FromObject(str1);
11055 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11056 return NULL;
11057 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011058 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011059 Py_DECREF(str1);
11060 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011061 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011062
11063 result = replace(self, str1, str2, maxcount);
11064
11065 Py_DECREF(str1);
11066 Py_DECREF(str2);
11067 return result;
11068}
11069
Alexander Belopolsky40018472011-02-26 01:02:56 +000011070static PyObject *
11071unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011072{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011073 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011074 Py_ssize_t isize;
11075 Py_ssize_t osize, squote, dquote, i, o;
11076 Py_UCS4 max, quote;
11077 int ikind, okind;
11078 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011079
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011080 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011081 return NULL;
11082
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011083 isize = PyUnicode_GET_LENGTH(unicode);
11084 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011085
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011086 /* Compute length of output, quote characters, and
11087 maximum character */
11088 osize = 2; /* quotes */
11089 max = 127;
11090 squote = dquote = 0;
11091 ikind = PyUnicode_KIND(unicode);
11092 for (i = 0; i < isize; i++) {
11093 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11094 switch (ch) {
11095 case '\'': squote++; osize++; break;
11096 case '"': dquote++; osize++; break;
11097 case '\\': case '\t': case '\r': case '\n':
11098 osize += 2; break;
11099 default:
11100 /* Fast-path ASCII */
11101 if (ch < ' ' || ch == 0x7f)
11102 osize += 4; /* \xHH */
11103 else if (ch < 0x7f)
11104 osize++;
11105 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11106 osize++;
11107 max = ch > max ? ch : max;
11108 }
11109 else if (ch < 0x100)
11110 osize += 4; /* \xHH */
11111 else if (ch < 0x10000)
11112 osize += 6; /* \uHHHH */
11113 else
11114 osize += 10; /* \uHHHHHHHH */
11115 }
11116 }
11117
11118 quote = '\'';
11119 if (squote) {
11120 if (dquote)
11121 /* Both squote and dquote present. Use squote,
11122 and escape them */
11123 osize += squote;
11124 else
11125 quote = '"';
11126 }
11127
11128 repr = PyUnicode_New(osize, max);
11129 if (repr == NULL)
11130 return NULL;
11131 okind = PyUnicode_KIND(repr);
11132 odata = PyUnicode_DATA(repr);
11133
11134 PyUnicode_WRITE(okind, odata, 0, quote);
11135 PyUnicode_WRITE(okind, odata, osize-1, quote);
11136
11137 for (i = 0, o = 1; i < isize; i++) {
11138 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011139
11140 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011141 if ((ch == quote) || (ch == '\\')) {
11142 PyUnicode_WRITE(okind, odata, o++, '\\');
11143 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011144 continue;
11145 }
11146
Benjamin Peterson29060642009-01-31 22:14:21 +000011147 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011148 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011149 PyUnicode_WRITE(okind, odata, o++, '\\');
11150 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011151 }
11152 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011153 PyUnicode_WRITE(okind, odata, o++, '\\');
11154 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011155 }
11156 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011157 PyUnicode_WRITE(okind, odata, o++, '\\');
11158 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011159 }
11160
11161 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011162 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011163 PyUnicode_WRITE(okind, odata, o++, '\\');
11164 PyUnicode_WRITE(okind, odata, o++, 'x');
11165 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11166 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011167 }
11168
Georg Brandl559e5d72008-06-11 18:37:52 +000011169 /* Copy ASCII characters as-is */
11170 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011171 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011172 }
11173
Benjamin Peterson29060642009-01-31 22:14:21 +000011174 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000011175 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011176 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000011177 (categories Z* and C* except ASCII space)
11178 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011179 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011180 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011181 if (ch <= 0xff) {
11182 PyUnicode_WRITE(okind, odata, o++, '\\');
11183 PyUnicode_WRITE(okind, odata, o++, 'x');
11184 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11185 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011186 }
11187 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011188 else if (ch >= 0x10000) {
11189 PyUnicode_WRITE(okind, odata, o++, '\\');
11190 PyUnicode_WRITE(okind, odata, o++, 'U');
11191 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
11192 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
11193 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
11194 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
11195 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11196 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11197 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11198 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011199 }
11200 /* Map 16-bit characters to '\uxxxx' */
11201 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011202 PyUnicode_WRITE(okind, odata, o++, '\\');
11203 PyUnicode_WRITE(okind, odata, o++, 'u');
11204 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11205 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11206 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11207 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011208 }
11209 }
11210 /* Copy characters as-is */
11211 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011212 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011213 }
11214 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000011215 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011216 /* Closing quote already added at the beginning */
Walter Dörwald79e913e2007-05-12 11:08:06 +000011217 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011218}
11219
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011220PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011221 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011222\n\
11223Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011224such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011225arguments start and end are interpreted as in slice notation.\n\
11226\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011227Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011228
11229static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011230unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011231{
Jesus Ceaac451502011-04-20 17:09:23 +020011232 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011233 Py_ssize_t start;
11234 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011235 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011236
Jesus Ceaac451502011-04-20 17:09:23 +020011237 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
11238 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011239 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011240
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011241 if (PyUnicode_READY(self) == -1)
11242 return NULL;
11243 if (PyUnicode_READY(substring) == -1)
11244 return NULL;
11245
11246 result = any_find_slice(
11247 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11248 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011249 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011250
11251 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011252
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011253 if (result == -2)
11254 return NULL;
11255
Christian Heimes217cfd12007-12-02 14:31:20 +000011256 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011257}
11258
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011259PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011260 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011261\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011262Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011263
11264static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011265unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011266{
Jesus Ceaac451502011-04-20 17:09:23 +020011267 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011268 Py_ssize_t start;
11269 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011270 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011271
Jesus Ceaac451502011-04-20 17:09:23 +020011272 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
11273 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011274 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011275
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011276 if (PyUnicode_READY(self) == -1)
11277 return NULL;
11278 if (PyUnicode_READY(substring) == -1)
11279 return NULL;
11280
11281 result = any_find_slice(
11282 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11283 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011284 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011285
11286 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011287
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011288 if (result == -2)
11289 return NULL;
11290
Guido van Rossumd57fd912000-03-10 22:53:23 +000011291 if (result < 0) {
11292 PyErr_SetString(PyExc_ValueError, "substring not found");
11293 return NULL;
11294 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011295
Christian Heimes217cfd12007-12-02 14:31:20 +000011296 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011297}
11298
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011299PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011300 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011301\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011302Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011303done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011304
11305static PyObject *
11306unicode_rjust(PyUnicodeObject *self, PyObject *args)
11307{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011308 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011309 Py_UCS4 fillchar = ' ';
11310
Victor Stinnere9a29352011-10-01 02:14:59 +020011311 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011312 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011313
Victor Stinnere9a29352011-10-01 02:14:59 +020011314 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011315 return NULL;
11316
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011317 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011318 Py_INCREF(self);
11319 return (PyObject*) self;
11320 }
11321
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011322 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011323}
11324
Alexander Belopolsky40018472011-02-26 01:02:56 +000011325PyObject *
11326PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011327{
11328 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000011329
Guido van Rossumd57fd912000-03-10 22:53:23 +000011330 s = PyUnicode_FromObject(s);
11331 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011332 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011333 if (sep != NULL) {
11334 sep = PyUnicode_FromObject(sep);
11335 if (sep == NULL) {
11336 Py_DECREF(s);
11337 return NULL;
11338 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011339 }
11340
11341 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11342
11343 Py_DECREF(s);
11344 Py_XDECREF(sep);
11345 return result;
11346}
11347
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011348PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011349 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011350\n\
11351Return a list of the words in S, using sep as the\n\
11352delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000011353splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000011354whitespace string is a separator and empty strings are\n\
11355removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011356
11357static PyObject*
11358unicode_split(PyUnicodeObject *self, PyObject *args)
11359{
11360 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011361 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011362
Martin v. Löwis18e16552006-02-15 17:27:45 +000011363 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011364 return NULL;
11365
11366 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011367 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011368 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011369 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011370 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011371 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011372}
11373
Thomas Wouters477c8d52006-05-27 19:21:47 +000011374PyObject *
11375PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11376{
11377 PyObject* str_obj;
11378 PyObject* sep_obj;
11379 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011380 int kind1, kind2, kind;
11381 void *buf1 = NULL, *buf2 = NULL;
11382 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011383
11384 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020011385 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011386 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011387 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011388 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011389 Py_DECREF(str_obj);
11390 return NULL;
11391 }
11392
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011393 kind1 = PyUnicode_KIND(str_in);
11394 kind2 = PyUnicode_KIND(sep_obj);
11395 kind = kind1 > kind2 ? kind1 : kind2;
11396 buf1 = PyUnicode_DATA(str_in);
11397 if (kind1 != kind)
11398 buf1 = _PyUnicode_AsKind(str_in, kind);
11399 if (!buf1)
11400 goto onError;
11401 buf2 = PyUnicode_DATA(sep_obj);
11402 if (kind2 != kind)
11403 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11404 if (!buf2)
11405 goto onError;
11406 len1 = PyUnicode_GET_LENGTH(str_obj);
11407 len2 = PyUnicode_GET_LENGTH(sep_obj);
11408
11409 switch(PyUnicode_KIND(str_in)) {
11410 case PyUnicode_1BYTE_KIND:
11411 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11412 break;
11413 case PyUnicode_2BYTE_KIND:
11414 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11415 break;
11416 case PyUnicode_4BYTE_KIND:
11417 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11418 break;
11419 default:
11420 assert(0);
11421 out = 0;
11422 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011423
11424 Py_DECREF(sep_obj);
11425 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011426 if (kind1 != kind)
11427 PyMem_Free(buf1);
11428 if (kind2 != kind)
11429 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011430
11431 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011432 onError:
11433 Py_DECREF(sep_obj);
11434 Py_DECREF(str_obj);
11435 if (kind1 != kind && buf1)
11436 PyMem_Free(buf1);
11437 if (kind2 != kind && buf2)
11438 PyMem_Free(buf2);
11439 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011440}
11441
11442
11443PyObject *
11444PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11445{
11446 PyObject* str_obj;
11447 PyObject* sep_obj;
11448 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011449 int kind1, kind2, kind;
11450 void *buf1 = NULL, *buf2 = NULL;
11451 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011452
11453 str_obj = PyUnicode_FromObject(str_in);
11454 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000011455 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011456 sep_obj = PyUnicode_FromObject(sep_in);
11457 if (!sep_obj) {
11458 Py_DECREF(str_obj);
11459 return NULL;
11460 }
11461
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011462 kind1 = PyUnicode_KIND(str_in);
11463 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020011464 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011465 buf1 = PyUnicode_DATA(str_in);
11466 if (kind1 != kind)
11467 buf1 = _PyUnicode_AsKind(str_in, kind);
11468 if (!buf1)
11469 goto onError;
11470 buf2 = PyUnicode_DATA(sep_obj);
11471 if (kind2 != kind)
11472 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11473 if (!buf2)
11474 goto onError;
11475 len1 = PyUnicode_GET_LENGTH(str_obj);
11476 len2 = PyUnicode_GET_LENGTH(sep_obj);
11477
11478 switch(PyUnicode_KIND(str_in)) {
11479 case PyUnicode_1BYTE_KIND:
11480 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11481 break;
11482 case PyUnicode_2BYTE_KIND:
11483 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11484 break;
11485 case PyUnicode_4BYTE_KIND:
11486 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11487 break;
11488 default:
11489 assert(0);
11490 out = 0;
11491 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011492
11493 Py_DECREF(sep_obj);
11494 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011495 if (kind1 != kind)
11496 PyMem_Free(buf1);
11497 if (kind2 != kind)
11498 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011499
11500 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011501 onError:
11502 Py_DECREF(sep_obj);
11503 Py_DECREF(str_obj);
11504 if (kind1 != kind && buf1)
11505 PyMem_Free(buf1);
11506 if (kind2 != kind && buf2)
11507 PyMem_Free(buf2);
11508 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011509}
11510
11511PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011512 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011513\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011514Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011515the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011516found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011517
11518static PyObject*
11519unicode_partition(PyUnicodeObject *self, PyObject *separator)
11520{
11521 return PyUnicode_Partition((PyObject *)self, separator);
11522}
11523
11524PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000011525 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011526\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011527Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011528the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011529separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011530
11531static PyObject*
11532unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
11533{
11534 return PyUnicode_RPartition((PyObject *)self, separator);
11535}
11536
Alexander Belopolsky40018472011-02-26 01:02:56 +000011537PyObject *
11538PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011539{
11540 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011541
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011542 s = PyUnicode_FromObject(s);
11543 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011544 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011545 if (sep != NULL) {
11546 sep = PyUnicode_FromObject(sep);
11547 if (sep == NULL) {
11548 Py_DECREF(s);
11549 return NULL;
11550 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011551 }
11552
11553 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11554
11555 Py_DECREF(s);
11556 Py_XDECREF(sep);
11557 return result;
11558}
11559
11560PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011561 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011562\n\
11563Return a list of the words in S, using sep as the\n\
11564delimiter string, starting at the end of the string and\n\
11565working to the front. If maxsplit is given, at most maxsplit\n\
11566splits are done. If sep is not specified, any whitespace string\n\
11567is a separator.");
11568
11569static PyObject*
11570unicode_rsplit(PyUnicodeObject *self, PyObject *args)
11571{
11572 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011573 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011574
Martin v. Löwis18e16552006-02-15 17:27:45 +000011575 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011576 return NULL;
11577
11578 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011579 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011580 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011581 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011582 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011583 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011584}
11585
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011586PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011587 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011588\n\
11589Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000011590Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011591is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011592
11593static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011594unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011595{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011596 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000011597 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011598
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011599 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
11600 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011601 return NULL;
11602
Guido van Rossum86662912000-04-11 15:38:46 +000011603 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011604}
11605
11606static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000011607PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011608{
Walter Dörwald346737f2007-05-31 10:44:43 +000011609 if (PyUnicode_CheckExact(self)) {
11610 Py_INCREF(self);
11611 return self;
11612 } else
11613 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020011614 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011615}
11616
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011617PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011618 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011619\n\
11620Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011621and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011622
11623static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011624unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011625{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011626 return fixup(self, fixswapcase);
11627}
11628
Georg Brandlceee0772007-11-27 23:48:05 +000011629PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011630 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011631\n\
11632Return a translation table usable for str.translate().\n\
11633If there is only one argument, it must be a dictionary mapping Unicode\n\
11634ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011635Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011636If there are two arguments, they must be strings of equal length, and\n\
11637in the resulting dictionary, each character in x will be mapped to the\n\
11638character at the same position in y. If there is a third argument, it\n\
11639must be a string, whose characters will be mapped to None in the result.");
11640
11641static PyObject*
11642unicode_maketrans(PyUnicodeObject *null, PyObject *args)
11643{
11644 PyObject *x, *y = NULL, *z = NULL;
11645 PyObject *new = NULL, *key, *value;
11646 Py_ssize_t i = 0;
11647 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011648
Georg Brandlceee0772007-11-27 23:48:05 +000011649 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
11650 return NULL;
11651 new = PyDict_New();
11652 if (!new)
11653 return NULL;
11654 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011655 int x_kind, y_kind, z_kind;
11656 void *x_data, *y_data, *z_data;
11657
Georg Brandlceee0772007-11-27 23:48:05 +000011658 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000011659 if (!PyUnicode_Check(x)) {
11660 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
11661 "be a string if there is a second argument");
11662 goto err;
11663 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011664 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011665 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
11666 "arguments must have equal length");
11667 goto err;
11668 }
11669 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011670 x_kind = PyUnicode_KIND(x);
11671 y_kind = PyUnicode_KIND(y);
11672 x_data = PyUnicode_DATA(x);
11673 y_data = PyUnicode_DATA(y);
11674 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
11675 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
11676 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011677 if (!key || !value)
11678 goto err;
11679 res = PyDict_SetItem(new, key, value);
11680 Py_DECREF(key);
11681 Py_DECREF(value);
11682 if (res < 0)
11683 goto err;
11684 }
11685 /* create entries for deleting chars in z */
11686 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011687 z_kind = PyUnicode_KIND(z);
11688 z_data = PyUnicode_DATA(z);
Georg Brandlceee0772007-11-27 23:48:05 +000011689 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011690 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011691 if (!key)
11692 goto err;
11693 res = PyDict_SetItem(new, key, Py_None);
11694 Py_DECREF(key);
11695 if (res < 0)
11696 goto err;
11697 }
11698 }
11699 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011700 int kind;
11701 void *data;
11702
Georg Brandlceee0772007-11-27 23:48:05 +000011703 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000011704 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011705 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
11706 "to maketrans it must be a dict");
11707 goto err;
11708 }
11709 /* copy entries into the new dict, converting string keys to int keys */
11710 while (PyDict_Next(x, &i, &key, &value)) {
11711 if (PyUnicode_Check(key)) {
11712 /* convert string keys to integer keys */
11713 PyObject *newkey;
11714 if (PyUnicode_GET_SIZE(key) != 1) {
11715 PyErr_SetString(PyExc_ValueError, "string keys in translate "
11716 "table must be of length 1");
11717 goto err;
11718 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011719 kind = PyUnicode_KIND(key);
11720 data = PyUnicode_DATA(key);
11721 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000011722 if (!newkey)
11723 goto err;
11724 res = PyDict_SetItem(new, newkey, value);
11725 Py_DECREF(newkey);
11726 if (res < 0)
11727 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000011728 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011729 /* just keep integer keys */
11730 if (PyDict_SetItem(new, key, value) < 0)
11731 goto err;
11732 } else {
11733 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
11734 "be strings or integers");
11735 goto err;
11736 }
11737 }
11738 }
11739 return new;
11740 err:
11741 Py_DECREF(new);
11742 return NULL;
11743}
11744
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011745PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011746 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011747\n\
11748Return a copy of the string S, where all characters have been mapped\n\
11749through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011750Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000011751Unmapped characters are left untouched. Characters mapped to None\n\
11752are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011753
11754static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011755unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011756{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011757 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011758}
11759
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011760PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011761 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011762\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011763Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011764
11765static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011766unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011767{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011768 return fixup(self, fixupper);
11769}
11770
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011771PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011772 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011773\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000011774Pad a numeric string S with zeros on the left, to fill a field\n\
11775of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011776
11777static PyObject *
11778unicode_zfill(PyUnicodeObject *self, PyObject *args)
11779{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011780 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011781 PyUnicodeObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011782 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011783 int kind;
11784 void *data;
11785 Py_UCS4 chr;
11786
11787 if (PyUnicode_READY(self) == -1)
11788 return NULL;
11789
Martin v. Löwis18e16552006-02-15 17:27:45 +000011790 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011791 return NULL;
11792
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011793 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000011794 if (PyUnicode_CheckExact(self)) {
11795 Py_INCREF(self);
11796 return (PyObject*) self;
11797 }
11798 else
Victor Stinner2219e0a2011-10-01 01:16:59 +020011799 return PyUnicode_Copy((PyObject*)self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011800 }
11801
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011802 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011803
11804 u = pad(self, fill, 0, '0');
11805
Walter Dörwald068325e2002-04-15 13:36:47 +000011806 if (u == NULL)
11807 return NULL;
11808
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011809 kind = PyUnicode_KIND(u);
11810 data = PyUnicode_DATA(u);
11811 chr = PyUnicode_READ(kind, data, fill);
11812
11813 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011814 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011815 PyUnicode_WRITE(kind, data, 0, chr);
11816 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000011817 }
11818
11819 return (PyObject*) u;
11820}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011821
#if 0
/* Compiled out.  Thin wrapper that forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII() and returns its result.
   The matching (also disabled) entry in unicode_methods labels it as a
   method used only for debugging the implementation. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    return PyUnicode_TransformDecimalAndSpaceToASCII(self);
}
#endif
11829
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011830PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011831 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011832\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011833Return True if S starts with the specified prefix, False otherwise.\n\
11834With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011835With optional end, stop comparing S at that position.\n\
11836prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011837
11838static PyObject *
11839unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011840 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011841{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011842 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011843 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011844 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011845 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011846 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011847
Jesus Ceaac451502011-04-20 17:09:23 +020011848 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011849 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011850 if (PyTuple_Check(subobj)) {
11851 Py_ssize_t i;
11852 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11853 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011854 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011855 if (substring == NULL)
11856 return NULL;
11857 result = tailmatch(self, substring, start, end, -1);
11858 Py_DECREF(substring);
11859 if (result) {
11860 Py_RETURN_TRUE;
11861 }
11862 }
11863 /* nothing matched */
11864 Py_RETURN_FALSE;
11865 }
11866 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011867 if (substring == NULL) {
11868 if (PyErr_ExceptionMatches(PyExc_TypeError))
11869 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
11870 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011871 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011872 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011873 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011874 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011875 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011876}
11877
11878
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011879PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011880 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011881\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011882Return True if S ends with the specified suffix, False otherwise.\n\
11883With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011884With optional end, stop comparing S at that position.\n\
11885suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011886
11887static PyObject *
11888unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011889 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011890{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011891 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011892 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011893 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011894 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011895 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011896
Jesus Ceaac451502011-04-20 17:09:23 +020011897 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011898 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011899 if (PyTuple_Check(subobj)) {
11900 Py_ssize_t i;
11901 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11902 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011903 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011904 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011905 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011906 result = tailmatch(self, substring, start, end, +1);
11907 Py_DECREF(substring);
11908 if (result) {
11909 Py_RETURN_TRUE;
11910 }
11911 }
11912 Py_RETURN_FALSE;
11913 }
11914 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011915 if (substring == NULL) {
11916 if (PyErr_ExceptionMatches(PyExc_TypeError))
11917 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
11918 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011919 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011920 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011921 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011922 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011923 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011924}
11925
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011926#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000011927
/* Docstring for the "format" entry of unicode_methods, which is bound to
   do_string_format. */
PyDoc_STRVAR(format__doc__,
             "S.format(*args, **kwargs) -> str\n\
\n\
Return a formatted version of S, using substitutions from args and kwargs.\n\
The substitutions are identified by braces ('{' and '}').");

/* Docstring for the "format_map" entry of unicode_methods, which is bound
   to do_string_format_map. */
PyDoc_STRVAR(format_map__doc__,
             "S.format_map(mapping) -> str\n\
\n\
Return a formatted version of S, using substitutions from mapping.\n\
The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000011939
Eric Smith4a7d76d2008-05-30 18:10:19 +000011940static PyObject *
11941unicode__format__(PyObject* self, PyObject* args)
11942{
11943 PyObject *format_spec;
11944
11945 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
11946 return NULL;
11947
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011948 return _PyUnicode_FormatAdvanced(self, format_spec, 0,
11949 PyUnicode_GET_LENGTH(format_spec));
Eric Smith4a7d76d2008-05-30 18:10:19 +000011950}
11951
Eric Smith8c663262007-08-25 02:26:07 +000011952PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011953 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000011954\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011955Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000011956
11957static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011958unicode__sizeof__(PyUnicodeObject *v)
11959{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011960 Py_ssize_t size;
11961
11962 /* If it's a compact object, account for base structure +
11963 character data. */
11964 if (PyUnicode_IS_COMPACT_ASCII(v))
11965 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
11966 else if (PyUnicode_IS_COMPACT(v))
11967 size = sizeof(PyCompactUnicodeObject) +
11968 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v);
11969 else {
11970 /* If it is a two-block object, account for base object, and
11971 for character block if present. */
11972 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020011973 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011974 size += (PyUnicode_GET_LENGTH(v) + 1) *
11975 PyUnicode_CHARACTER_SIZE(v);
11976 }
11977 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020011978 with the data pointer. Check if the data is not shared. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011979 if (_PyUnicode_WSTR(v) &&
Victor Stinnera3be6132011-10-03 02:16:37 +020011980 (PyUnicode_DATA(v) != _PyUnicode_WSTR(v)))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011981 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020011982 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020011983 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011984
11985 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011986}
11987
11988PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011989 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011990
11991static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020011992unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000011993{
Victor Stinner034f6cf2011-09-30 02:26:44 +020011994 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011995 if (!copy)
11996 return NULL;
11997 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000011998}
11999
/* Method table of the unicode type.

   Each entry gives the Python-level method name, the C function
   implementing it, the calling convention flags and the docstring
   (omitted for "__getnewargs__").  The table ends with a {NULL, NULL}
   sentinel.  "maketrans" is the only entry flagged METH_STATIC.
   Entries inside "#if 0" are compiled out. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially.  */

    {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
    {"maketrans", (PyCFunction) unicode_maketrans,
     METH_VARARGS | METH_STATIC, maketrans__doc__},
    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
#if 0
    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
#endif

#if 0
    /* These methods are just used for debugging the implementation. */
    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
#endif

    {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};
12063
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012064static PyObject *
12065unicode_mod(PyObject *v, PyObject *w)
12066{
Brian Curtindfc80e32011-08-10 20:28:54 -050012067 if (!PyUnicode_Check(v))
12068 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012069 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012070}
12071
/* Number protocol table for unicode objects.  Only the remainder slot is
   filled in (with unicode_mod); the three slots before it are explicitly
   0 and the members after it are left out of the initializer, so they are
   zero-initialized. */
static PyNumberMethods unicode_as_number = {
    0,              /*nb_add*/
    0,              /*nb_subtract*/
    0,              /*nb_multiply*/
    unicode_mod,            /*nb_remainder*/
};
12078
/* Sequence protocol table for unicode objects: length, concatenation,
   repetition, item access and containment are provided; the slice and
   the two assignment slots are 0. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,       /* sq_length */
    PyUnicode_Concat,           /* sq_concat */
    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    0,                  /* sq_slice */
    0,                  /* sq_ass_item */
    0,                  /* sq_ass_slice */
    PyUnicode_Contains,         /* sq_contains */
};
12089
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012090static PyObject*
12091unicode_subscript(PyUnicodeObject* self, PyObject* item)
12092{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012093 if (PyUnicode_READY(self) == -1)
12094 return NULL;
12095
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000012096 if (PyIndex_Check(item)) {
12097 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012098 if (i == -1 && PyErr_Occurred())
12099 return NULL;
12100 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012101 i += PyUnicode_GET_LENGTH(self);
Victor Stinner2fe5ced2011-10-02 00:25:40 +020012102 return unicode_getitem((PyObject*)self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012103 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000012104 Py_ssize_t start, stop, step, slicelength, cur, i;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012105 const Py_UNICODE* source_buf;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012106 Py_UNICODE* result_buf;
12107 PyObject* result;
12108
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012109 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000012110 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012111 return NULL;
12112 }
12113
12114 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012115 return PyUnicode_New(0, 0);
12116 } else if (start == 0 && step == 1 &&
12117 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000012118 PyUnicode_CheckExact(self)) {
12119 Py_INCREF(self);
12120 return (PyObject *)self;
12121 } else if (step == 1) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012122 return PyUnicode_Substring((PyObject*)self,
12123 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012124 } else {
12125 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +000012126 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
12127 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012128
Benjamin Peterson29060642009-01-31 22:14:21 +000012129 if (result_buf == NULL)
12130 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012131
12132 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
12133 result_buf[i] = source_buf[cur];
12134 }
Tim Petersced69f82003-09-16 20:30:58 +000012135
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012136 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +000012137 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012138 return result;
12139 }
12140 } else {
12141 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
12142 return NULL;
12143 }
12144}
12145
/* Mapping protocol table for unicode objects: length and subscripting
   (unicode_subscript) are provided; item assignment is 0. */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,        /* mp_length */
    (binaryfunc)unicode_subscript,  /* mp_subscript */
    (objobjargproc)0,           /* mp_ass_subscript */
};
12151
Guido van Rossumd57fd912000-03-10 22:53:23 +000012152
Guido van Rossumd57fd912000-03-10 22:53:23 +000012153/* Helpers for PyUnicode_Format() */
12154
12155static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000012156getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012157{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012158 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012159 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012160 (*p_argidx)++;
12161 if (arglen < 0)
12162 return args;
12163 else
12164 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012165 }
12166 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012167 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012168 return NULL;
12169}
12170
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012171/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012172
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012173static PyObject *
12174formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012175{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012176 char *p;
12177 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012178 double x;
Tim Petersced69f82003-09-16 20:30:58 +000012179
Guido van Rossumd57fd912000-03-10 22:53:23 +000012180 x = PyFloat_AsDouble(v);
12181 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012182 return NULL;
12183
Guido van Rossumd57fd912000-03-10 22:53:23 +000012184 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012185 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000012186
Eric Smith0923d1d2009-04-16 20:16:10 +000012187 p = PyOS_double_to_string(x, type, prec,
12188 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012189 if (p == NULL)
12190 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012191 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000012192 PyMem_Free(p);
12193 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012194}
12195
Tim Peters38fd5b62000-09-21 05:43:11 +000012196static PyObject*
12197formatlong(PyObject *val, int flags, int prec, int type)
12198{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012199 char *buf;
12200 int len;
12201 PyObject *str; /* temporary string object. */
12202 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012203
Benjamin Peterson14339b62009-01-31 16:36:08 +000012204 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
12205 if (!str)
12206 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012207 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012208 Py_DECREF(str);
12209 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012210}
12211
Guido van Rossumd57fd912000-03-10 22:53:23 +000012212static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012213formatchar(Py_UCS4 *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012214 size_t buflen,
12215 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012216{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012217 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012218 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012219 if (PyUnicode_GET_LENGTH(v) == 1) {
12220 buf[0] = PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000012221 buf[1] = '\0';
12222 return 1;
12223 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012224 goto onError;
12225 }
12226 else {
12227 /* Integer input truncated to a character */
12228 long x;
12229 x = PyLong_AsLong(v);
12230 if (x == -1 && PyErr_Occurred())
12231 goto onError;
12232
12233 if (x < 0 || x > 0x10ffff) {
12234 PyErr_SetString(PyExc_OverflowError,
12235 "%c arg not in range(0x110000)");
12236 return -1;
12237 }
12238
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012239 buf[0] = (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012240 buf[1] = '\0';
12241 return 1;
12242 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012243
Benjamin Peterson29060642009-01-31 22:14:21 +000012244 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012245 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012246 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012247 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012248}
12249
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012250/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012251 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012252*/
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012253#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012254
Alexander Belopolsky40018472011-02-26 01:02:56 +000012255PyObject *
12256PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012257{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012258 void *fmt;
12259 int fmtkind;
12260 PyObject *result;
12261 Py_UCS4 *res, *res0;
12262 Py_UCS4 max;
12263 int kind;
12264 Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012265 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012266 PyObject *dict = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012267 PyUnicodeObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +000012268
Guido van Rossumd57fd912000-03-10 22:53:23 +000012269 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012270 PyErr_BadInternalCall();
12271 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012272 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012273 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
12274 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012275 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012276 fmt = PyUnicode_DATA(uformat);
12277 fmtkind = PyUnicode_KIND(uformat);
12278 fmtcnt = PyUnicode_GET_LENGTH(uformat);
12279 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012280
12281 reslen = rescnt = fmtcnt + 100;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012282 res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4));
12283 if (res0 == NULL) {
12284 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012285 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012286 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012287
12288 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012289 arglen = PyTuple_Size(args);
12290 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012291 }
12292 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012293 arglen = -1;
12294 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012295 }
Christian Heimes90aa7642007-12-19 02:45:37 +000012296 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000012297 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000012298 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012299
12300 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012301 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012302 if (--rescnt < 0) {
12303 rescnt = fmtcnt + 100;
12304 reslen += rescnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012305 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12306 if (res0 == NULL){
12307 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012308 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012309 }
12310 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012311 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012312 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012313 *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012314 }
12315 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012316 /* Got a format specifier */
12317 int flags = 0;
12318 Py_ssize_t width = -1;
12319 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012320 Py_UCS4 c = '\0';
12321 Py_UCS4 fill;
Benjamin Peterson29060642009-01-31 22:14:21 +000012322 int isnumok;
12323 PyObject *v = NULL;
12324 PyObject *temp = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012325 void *pbuf;
12326 Py_ssize_t pindex;
Benjamin Peterson29060642009-01-31 22:14:21 +000012327 Py_UNICODE sign;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012328 Py_ssize_t len, len1;
12329 Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012330
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012331 fmtpos++;
12332 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
12333 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000012334 Py_ssize_t keylen;
12335 PyObject *key;
12336 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000012337
Benjamin Peterson29060642009-01-31 22:14:21 +000012338 if (dict == NULL) {
12339 PyErr_SetString(PyExc_TypeError,
12340 "format requires a mapping");
12341 goto onError;
12342 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012343 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012344 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012345 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012346 /* Skip over balanced parentheses */
12347 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012348 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000012349 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012350 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000012351 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012352 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000012353 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012354 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012355 if (fmtcnt < 0 || pcount > 0) {
12356 PyErr_SetString(PyExc_ValueError,
12357 "incomplete format key");
12358 goto onError;
12359 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020012360 key = PyUnicode_Substring((PyObject*)uformat,
12361 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000012362 if (key == NULL)
12363 goto onError;
12364 if (args_owned) {
12365 Py_DECREF(args);
12366 args_owned = 0;
12367 }
12368 args = PyObject_GetItem(dict, key);
12369 Py_DECREF(key);
12370 if (args == NULL) {
12371 goto onError;
12372 }
12373 args_owned = 1;
12374 arglen = -1;
12375 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012376 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012377 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012378 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012379 case '-': flags |= F_LJUST; continue;
12380 case '+': flags |= F_SIGN; continue;
12381 case ' ': flags |= F_BLANK; continue;
12382 case '#': flags |= F_ALT; continue;
12383 case '0': flags |= F_ZERO; continue;
12384 }
12385 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012386 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012387 if (c == '*') {
12388 v = getnextarg(args, arglen, &argidx);
12389 if (v == NULL)
12390 goto onError;
12391 if (!PyLong_Check(v)) {
12392 PyErr_SetString(PyExc_TypeError,
12393 "* wants int");
12394 goto onError;
12395 }
12396 width = PyLong_AsLong(v);
12397 if (width == -1 && PyErr_Occurred())
12398 goto onError;
12399 if (width < 0) {
12400 flags |= F_LJUST;
12401 width = -width;
12402 }
12403 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012404 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012405 }
12406 else if (c >= '0' && c <= '9') {
12407 width = c - '0';
12408 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012409 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012410 if (c < '0' || c > '9')
12411 break;
12412 if ((width*10) / 10 != width) {
12413 PyErr_SetString(PyExc_ValueError,
12414 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000012415 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000012416 }
12417 width = width*10 + (c - '0');
12418 }
12419 }
12420 if (c == '.') {
12421 prec = 0;
12422 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012423 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012424 if (c == '*') {
12425 v = getnextarg(args, arglen, &argidx);
12426 if (v == NULL)
12427 goto onError;
12428 if (!PyLong_Check(v)) {
12429 PyErr_SetString(PyExc_TypeError,
12430 "* wants int");
12431 goto onError;
12432 }
12433 prec = PyLong_AsLong(v);
12434 if (prec == -1 && PyErr_Occurred())
12435 goto onError;
12436 if (prec < 0)
12437 prec = 0;
12438 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012439 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012440 }
12441 else if (c >= '0' && c <= '9') {
12442 prec = c - '0';
12443 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012444 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012445 if (c < '0' || c > '9')
12446 break;
12447 if ((prec*10) / 10 != prec) {
12448 PyErr_SetString(PyExc_ValueError,
12449 "prec too big");
12450 goto onError;
12451 }
12452 prec = prec*10 + (c - '0');
12453 }
12454 }
12455 } /* prec */
12456 if (fmtcnt >= 0) {
12457 if (c == 'h' || c == 'l' || c == 'L') {
12458 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012459 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012460 }
12461 }
12462 if (fmtcnt < 0) {
12463 PyErr_SetString(PyExc_ValueError,
12464 "incomplete format");
12465 goto onError;
12466 }
12467 if (c != '%') {
12468 v = getnextarg(args, arglen, &argidx);
12469 if (v == NULL)
12470 goto onError;
12471 }
12472 sign = 0;
12473 fill = ' ';
12474 switch (c) {
12475
12476 case '%':
12477 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012478 kind = PyUnicode_4BYTE_KIND;
Benjamin Peterson29060642009-01-31 22:14:21 +000012479 /* presume that buffer length is at least 1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012480 PyUnicode_WRITE(kind, pbuf, 0, '%');
Benjamin Peterson29060642009-01-31 22:14:21 +000012481 len = 1;
12482 break;
12483
12484 case 's':
12485 case 'r':
12486 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000012487 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012488 temp = v;
12489 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012490 }
12491 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012492 if (c == 's')
12493 temp = PyObject_Str(v);
12494 else if (c == 'r')
12495 temp = PyObject_Repr(v);
12496 else
12497 temp = PyObject_ASCII(v);
12498 if (temp == NULL)
12499 goto onError;
12500 if (PyUnicode_Check(temp))
12501 /* nothing to do */;
12502 else {
12503 Py_DECREF(temp);
12504 PyErr_SetString(PyExc_TypeError,
12505 "%s argument has non-string str()");
12506 goto onError;
12507 }
12508 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012509 if (PyUnicode_READY(temp) == -1) {
12510 Py_CLEAR(temp);
12511 goto onError;
12512 }
12513 pbuf = PyUnicode_DATA(temp);
12514 kind = PyUnicode_KIND(temp);
12515 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012516 if (prec >= 0 && len > prec)
12517 len = prec;
12518 break;
12519
12520 case 'i':
12521 case 'd':
12522 case 'u':
12523 case 'o':
12524 case 'x':
12525 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000012526 isnumok = 0;
12527 if (PyNumber_Check(v)) {
12528 PyObject *iobj=NULL;
12529
12530 if (PyLong_Check(v)) {
12531 iobj = v;
12532 Py_INCREF(iobj);
12533 }
12534 else {
12535 iobj = PyNumber_Long(v);
12536 }
12537 if (iobj!=NULL) {
12538 if (PyLong_Check(iobj)) {
12539 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070012540 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000012541 Py_DECREF(iobj);
12542 if (!temp)
12543 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012544 if (PyUnicode_READY(temp) == -1) {
12545 Py_CLEAR(temp);
12546 goto onError;
12547 }
12548 pbuf = PyUnicode_DATA(temp);
12549 kind = PyUnicode_KIND(temp);
12550 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012551 sign = 1;
12552 }
12553 else {
12554 Py_DECREF(iobj);
12555 }
12556 }
12557 }
12558 if (!isnumok) {
12559 PyErr_Format(PyExc_TypeError,
12560 "%%%c format: a number is required, "
12561 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
12562 goto onError;
12563 }
12564 if (flags & F_ZERO)
12565 fill = '0';
12566 break;
12567
12568 case 'e':
12569 case 'E':
12570 case 'f':
12571 case 'F':
12572 case 'g':
12573 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012574 temp = formatfloat(v, flags, prec, c);
12575 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000012576 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012577 if (PyUnicode_READY(temp) == -1) {
12578 Py_CLEAR(temp);
12579 goto onError;
12580 }
12581 pbuf = PyUnicode_DATA(temp);
12582 kind = PyUnicode_KIND(temp);
12583 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012584 sign = 1;
12585 if (flags & F_ZERO)
12586 fill = '0';
12587 break;
12588
12589 case 'c':
12590 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012591 kind = PyUnicode_4BYTE_KIND;
Victor Stinnerb9dcffb2011-09-29 00:39:24 +020012592 len = formatchar(pbuf, Py_ARRAY_LENGTH(formatbuf), v);
Benjamin Peterson29060642009-01-31 22:14:21 +000012593 if (len < 0)
12594 goto onError;
12595 break;
12596
12597 default:
12598 PyErr_Format(PyExc_ValueError,
12599 "unsupported format character '%c' (0x%x) "
12600 "at index %zd",
12601 (31<=c && c<=126) ? (char)c : '?',
12602 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012603 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000012604 goto onError;
12605 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012606 /* pbuf is initialized here. */
12607 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000012608 if (sign) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012609 if (PyUnicode_READ(kind, pbuf, pindex) == '-' ||
12610 PyUnicode_READ(kind, pbuf, pindex) == '+') {
12611 sign = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012612 len--;
12613 }
12614 else if (flags & F_SIGN)
12615 sign = '+';
12616 else if (flags & F_BLANK)
12617 sign = ' ';
12618 else
12619 sign = 0;
12620 }
12621 if (width < len)
12622 width = len;
12623 if (rescnt - (sign != 0) < width) {
12624 reslen -= rescnt;
12625 rescnt = width + fmtcnt + 100;
12626 reslen += rescnt;
12627 if (reslen < 0) {
12628 Py_XDECREF(temp);
12629 PyErr_NoMemory();
12630 goto onError;
12631 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012632 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12633 if (res0 == 0) {
12634 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012635 Py_XDECREF(temp);
12636 goto onError;
12637 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012638 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012639 }
12640 if (sign) {
12641 if (fill != ' ')
12642 *res++ = sign;
12643 rescnt--;
12644 if (width > len)
12645 width--;
12646 }
12647 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012648 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12649 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000012650 if (fill != ' ') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012651 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12652 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012653 }
12654 rescnt -= 2;
12655 width -= 2;
12656 if (width < 0)
12657 width = 0;
12658 len -= 2;
12659 }
12660 if (width > len && !(flags & F_LJUST)) {
12661 do {
12662 --rescnt;
12663 *res++ = fill;
12664 } while (--width > len);
12665 }
12666 if (fill == ' ') {
12667 if (sign)
12668 *res++ = sign;
12669 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012670 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12671 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
12672 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12673 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012674 }
12675 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012676 /* Copy all characters, preserving len */
12677 len1 = len;
12678 while (len1--) {
12679 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12680 rescnt--;
12681 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012682 while (--width >= len) {
12683 --rescnt;
12684 *res++ = ' ';
12685 }
12686 if (dict && (argidx < arglen) && c != '%') {
12687 PyErr_SetString(PyExc_TypeError,
12688 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +000012689 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012690 goto onError;
12691 }
12692 Py_XDECREF(temp);
12693 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012694 } /* until end */
12695 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012696 PyErr_SetString(PyExc_TypeError,
12697 "not all arguments converted during string formatting");
12698 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012699 }
12700
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012701
12702 for (max=0, res = res0; res < res0+reslen-rescnt; res++)
12703 if (*res > max)
12704 max = *res;
12705 result = PyUnicode_New(reslen - rescnt, max);
12706 if (!result)
Benjamin Peterson29060642009-01-31 22:14:21 +000012707 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012708 kind = PyUnicode_KIND(result);
12709 for (res = res0; res < res0+reslen-rescnt; res++)
12710 PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res);
12711 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012712 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012713 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012714 }
12715 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012716 return (PyObject *)result;
12717
Benjamin Peterson29060642009-01-31 22:14:21 +000012718 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012719 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012720 Py_DECREF(uformat);
12721 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012722 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012723 }
12724 return NULL;
12725}
12726
Jeremy Hylton938ace62002-07-17 16:30:39 +000012727static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000012728unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
12729
Tim Peters6d6c1a32001-08-02 04:15:00 +000012730static PyObject *
12731unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12732{
Benjamin Peterson29060642009-01-31 22:14:21 +000012733 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012734 static char *kwlist[] = {"object", "encoding", "errors", 0};
12735 char *encoding = NULL;
12736 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000012737
Benjamin Peterson14339b62009-01-31 16:36:08 +000012738 if (type != &PyUnicode_Type)
12739 return unicode_subtype_new(type, args, kwds);
12740 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000012741 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012742 return NULL;
12743 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012744 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012745 if (encoding == NULL && errors == NULL)
12746 return PyObject_Str(x);
12747 else
Benjamin Peterson29060642009-01-31 22:14:21 +000012748 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000012749}
12750
Guido van Rossume023fe02001-08-30 03:12:59 +000012751static PyObject *
12752unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12753{
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012754 PyUnicodeObject *unicode, *self;
12755 Py_ssize_t length, char_size;
12756 int share_wstr, share_utf8;
12757 unsigned int kind;
12758 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000012759
Benjamin Peterson14339b62009-01-31 16:36:08 +000012760 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012761
12762 unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
12763 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012764 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020012765 assert(_PyUnicode_CHECK(unicode));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012766 if (PyUnicode_READY(unicode))
12767 return NULL;
12768
12769 self = (PyUnicodeObject *) type->tp_alloc(type, 0);
12770 if (self == NULL) {
12771 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012772 return NULL;
12773 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012774 kind = PyUnicode_KIND(unicode);
12775 length = PyUnicode_GET_LENGTH(unicode);
12776
12777 _PyUnicode_LENGTH(self) = length;
12778 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
12779 _PyUnicode_STATE(self).interned = 0;
12780 _PyUnicode_STATE(self).kind = kind;
12781 _PyUnicode_STATE(self).compact = 0;
12782 _PyUnicode_STATE(self).ascii = 0;
12783 _PyUnicode_STATE(self).ready = 1;
12784 _PyUnicode_WSTR(self) = NULL;
12785 _PyUnicode_UTF8_LENGTH(self) = 0;
12786 _PyUnicode_UTF8(self) = NULL;
12787 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020012788 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012789
12790 share_utf8 = 0;
12791 share_wstr = 0;
12792 if (kind == PyUnicode_1BYTE_KIND) {
12793 char_size = 1;
12794 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
12795 share_utf8 = 1;
12796 }
12797 else if (kind == PyUnicode_2BYTE_KIND) {
12798 char_size = 2;
12799 if (sizeof(wchar_t) == 2)
12800 share_wstr = 1;
12801 }
12802 else {
12803 assert(kind == PyUnicode_4BYTE_KIND);
12804 char_size = 4;
12805 if (sizeof(wchar_t) == 4)
12806 share_wstr = 1;
12807 }
12808
12809 /* Ensure we won't overflow the length. */
12810 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
12811 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012812 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012813 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012814 data = PyObject_MALLOC((length + 1) * char_size);
12815 if (data == NULL) {
12816 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012817 goto onError;
12818 }
12819
Victor Stinnerc3c74152011-10-02 20:39:55 +020012820 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012821 if (share_utf8) {
12822 _PyUnicode_UTF8_LENGTH(self) = length;
12823 _PyUnicode_UTF8(self) = data;
12824 }
12825 if (share_wstr) {
12826 _PyUnicode_WSTR_LENGTH(self) = length;
12827 _PyUnicode_WSTR(self) = (wchar_t *)data;
12828 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012829
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012830 Py_MEMCPY(data, PyUnicode_DATA(unicode),
12831 PyUnicode_KIND_SIZE(kind, length + 1));
12832 Py_DECREF(unicode);
12833 return (PyObject *)self;
12834
12835onError:
12836 Py_DECREF(unicode);
12837 Py_DECREF(self);
12838 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000012839}
12840
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012841PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000012842 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000012843\n\
Collin Winterd474ce82007-08-07 19:42:11 +000012844Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000012845encoding defaults to the current default string encoding.\n\
12846errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000012847
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012848static PyObject *unicode_iter(PyObject *seq);
12849
Guido van Rossumd57fd912000-03-10 22:53:23 +000012850PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000012851 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012852 "str", /* tp_name */
12853 sizeof(PyUnicodeObject), /* tp_size */
12854 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012855 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012856 (destructor)unicode_dealloc, /* tp_dealloc */
12857 0, /* tp_print */
12858 0, /* tp_getattr */
12859 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000012860 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012861 unicode_repr, /* tp_repr */
12862 &unicode_as_number, /* tp_as_number */
12863 &unicode_as_sequence, /* tp_as_sequence */
12864 &unicode_as_mapping, /* tp_as_mapping */
12865 (hashfunc) unicode_hash, /* tp_hash*/
12866 0, /* tp_call*/
12867 (reprfunc) unicode_str, /* tp_str */
12868 PyObject_GenericGetAttr, /* tp_getattro */
12869 0, /* tp_setattro */
12870 0, /* tp_as_buffer */
12871 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000012872 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012873 unicode_doc, /* tp_doc */
12874 0, /* tp_traverse */
12875 0, /* tp_clear */
12876 PyUnicode_RichCompare, /* tp_richcompare */
12877 0, /* tp_weaklistoffset */
12878 unicode_iter, /* tp_iter */
12879 0, /* tp_iternext */
12880 unicode_methods, /* tp_methods */
12881 0, /* tp_members */
12882 0, /* tp_getset */
12883 &PyBaseObject_Type, /* tp_base */
12884 0, /* tp_dict */
12885 0, /* tp_descr_get */
12886 0, /* tp_descr_set */
12887 0, /* tp_dictoffset */
12888 0, /* tp_init */
12889 0, /* tp_alloc */
12890 unicode_new, /* tp_new */
12891 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012892};
12893
12894/* Initialize the Unicode implementation */
12895
Thomas Wouters78890102000-07-22 19:25:51 +000012896void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012897{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012898 int i;
12899
Thomas Wouters477c8d52006-05-27 19:21:47 +000012900 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012901 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012902 0x000A, /* LINE FEED */
12903 0x000D, /* CARRIAGE RETURN */
12904 0x001C, /* FILE SEPARATOR */
12905 0x001D, /* GROUP SEPARATOR */
12906 0x001E, /* RECORD SEPARATOR */
12907 0x0085, /* NEXT LINE */
12908 0x2028, /* LINE SEPARATOR */
12909 0x2029, /* PARAGRAPH SEPARATOR */
12910 };
12911
Fred Drakee4315f52000-05-09 19:53:39 +000012912 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020012913 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012914 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012915 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012916
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012917 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000012918 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000012919 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012920 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012921
12922 /* initialize the linebreak bloom filter */
12923 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012924 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020012925 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012926
12927 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012928}
12929
12930/* Finalize the Unicode implementation */
12931
/* Releases nothing in this implementation; always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
12937
Guido van Rossumd57fd912000-03-10 22:53:23 +000012938void
Thomas Wouters78890102000-07-22 19:25:51 +000012939_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012940{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012941 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012942
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000012943 Py_XDECREF(unicode_empty);
12944 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000012945
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012946 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012947 if (unicode_latin1[i]) {
12948 Py_DECREF(unicode_latin1[i]);
12949 unicode_latin1[i] = NULL;
12950 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012951 }
Christian Heimesa156e092008-02-16 07:38:31 +000012952 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012953}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000012954
/* Intern the string referenced by *p, in place.

   If an equal string is already stored in the `interned` dict, the
   caller's reference to *p is released and *p is replaced by a new
   reference to the stored string.  Otherwise *p itself is inserted into
   the dict (as both key and value) and marked SSTATE_INTERNED_MORTAL.

   The function returns nothing and reports no error.  It leaves *p
   untouched when the object is not an exact str, is already interned,
   cannot be made ready, or when the dict cannot be created or updated.
   In non-debug builds a NULL or non-unicode *p is ignored as well; debug
   builds assert on it instead. */
void
PyUnicode_InternInPlace(PyObject **p)
{
    register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
    PyObject *t;
#ifdef Py_DEBUG
    assert(s != NULL);
    assert(_PyUnicode_CHECK(s));
#else
    if (s == NULL || !PyUnicode_Check(s))
        return;
#endif
    /* If it's a subclass, we don't really know what putting
       it in the interned dict might do. */
    if (!PyUnicode_CheckExact(s))
        return;
    /* Already interned (mortal or immortal): nothing to do. */
    if (PyUnicode_CHECK_INTERNED(s))
        return;
    if (PyUnicode_READY(s) == -1) {
        assert(0 && "PyUnicode_READY fail in PyUnicode_InternInPlace");
        return;
    }
    /* The interned dict is created lazily on first use. */
    if (interned == NULL) {
        interned = PyDict_New();
        if (interned == NULL) {
            PyErr_Clear(); /* Don't leave an exception */
            return;
        }
    }
    /* It might be that the GetItem call fails even
       though the key is present in the dictionary,
       namely when this happens during a stack overflow. */
    Py_ALLOW_RECURSION
    t = PyDict_GetItem(interned, (PyObject *)s);
    Py_END_ALLOW_RECURSION

    if (t) {
        /* An equal string is already interned: hand that one to the
           caller.  The new reference is taken before the old object is
           released. */
        Py_INCREF(t);
        Py_DECREF(*p);
        *p = t;
        return;
    }

    /* The insertion is bracketed by recursion_critical = 1 / 0; the flag
       is reset on both the failure and the success path. */
    PyThreadState_GET()->recursion_critical = 1;
    if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
        PyErr_Clear();
        PyThreadState_GET()->recursion_critical = 0;
        return;
    }
    PyThreadState_GET()->recursion_critical = 0;
    /* The two references in interned (key and value) are not counted by
       refcnt.  The deallocator will take care of this */
    Py_REFCNT(s) -= 2;
    _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
}
13010
13011void
13012PyUnicode_InternImmortal(PyObject **p)
13013{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013014 PyUnicodeObject *u = (PyUnicodeObject *)*p;
13015
Benjamin Peterson14339b62009-01-31 16:36:08 +000013016 PyUnicode_InternInPlace(p);
13017 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013018 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013019 Py_INCREF(*p);
13020 }
Walter Dörwald16807132007-05-25 13:52:07 +000013021}
13022
13023PyObject *
13024PyUnicode_InternFromString(const char *cp)
13025{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013026 PyObject *s = PyUnicode_FromString(cp);
13027 if (s == NULL)
13028 return NULL;
13029 PyUnicode_InternInPlace(&s);
13030 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000013031}
13032
Alexander Belopolsky40018472011-02-26 01:02:56 +000013033void
13034_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000013035{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013036 PyObject *keys;
13037 PyUnicodeObject *s;
13038 Py_ssize_t i, n;
13039 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000013040
Benjamin Peterson14339b62009-01-31 16:36:08 +000013041 if (interned == NULL || !PyDict_Check(interned))
13042 return;
13043 keys = PyDict_Keys(interned);
13044 if (keys == NULL || !PyList_Check(keys)) {
13045 PyErr_Clear();
13046 return;
13047 }
Walter Dörwald16807132007-05-25 13:52:07 +000013048
Benjamin Peterson14339b62009-01-31 16:36:08 +000013049 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
13050 detector, interned unicode strings are not forcibly deallocated;
13051 rather, we give them their stolen references back, and then clear
13052 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000013053
Benjamin Peterson14339b62009-01-31 16:36:08 +000013054 n = PyList_GET_SIZE(keys);
13055 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000013056 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013057 for (i = 0; i < n; i++) {
13058 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013059 if (PyUnicode_READY(s) == -1)
13060 fprintf(stderr, "could not ready string\n");
13061 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013062 case SSTATE_NOT_INTERNED:
13063 /* XXX Shouldn't happen */
13064 break;
13065 case SSTATE_INTERNED_IMMORTAL:
13066 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013067 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013068 break;
13069 case SSTATE_INTERNED_MORTAL:
13070 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013071 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013072 break;
13073 default:
13074 Py_FatalError("Inconsistent interned string state.");
13075 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013076 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013077 }
13078 fprintf(stderr, "total size of all interned strings: "
13079 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
13080 "mortal/immortal\n", mortal_size, immortal_size);
13081 Py_DECREF(keys);
13082 PyDict_Clear(interned);
13083 Py_DECREF(interned);
13084 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000013085}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013086
13087
13088/********************* Unicode Iterator **************************/
13089
/* Instance layout of the str iterator (see PyUnicodeIter_Type). */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;     /* Index of the next character to return */
    PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
13095
13096static void
13097unicodeiter_dealloc(unicodeiterobject *it)
13098{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013099 _PyObject_GC_UNTRACK(it);
13100 Py_XDECREF(it->it_seq);
13101 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013102}
13103
13104static int
13105unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
13106{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013107 Py_VISIT(it->it_seq);
13108 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013109}
13110
13111static PyObject *
13112unicodeiter_next(unicodeiterobject *it)
13113{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013114 PyUnicodeObject *seq;
13115 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013116
Benjamin Peterson14339b62009-01-31 16:36:08 +000013117 assert(it != NULL);
13118 seq = it->it_seq;
13119 if (seq == NULL)
13120 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013121 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013122
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013123 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
13124 int kind = PyUnicode_KIND(seq);
13125 void *data = PyUnicode_DATA(seq);
13126 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
13127 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013128 if (item != NULL)
13129 ++it->it_index;
13130 return item;
13131 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013132
Benjamin Peterson14339b62009-01-31 16:36:08 +000013133 Py_DECREF(seq);
13134 it->it_seq = NULL;
13135 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013136}
13137
13138static PyObject *
13139unicodeiter_len(unicodeiterobject *it)
13140{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013141 Py_ssize_t len = 0;
13142 if (it->it_seq)
13143 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
13144 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013145}
13146
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");

/* Method table of the str iterator; __length_hint__ is the only entry. */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {NULL,      NULL}       /* sentinel */
};
13154
/* Type object of the str iterator ("str_iterator").  Instances are
   unicodeiterobject structs; the type is GC-enabled, returns itself from
   iter() and produces items through unicodeiter_next(). */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",                     /* tp_name */
    sizeof(unicodeiterobject),          /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_reserved */
    0,                                  /* tp_repr */
    0,                                  /* tp_as_number */
    0,                                  /* tp_as_sequence */
    0,                                  /* tp_as_mapping */
    0,                                  /* tp_hash */
    0,                                  /* tp_call */
    0,                                  /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    0,                                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                                  /* tp_clear */
    0,                                  /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    PyObject_SelfIter,                  /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,                /* tp_methods */
    0,                                  /* tp_members */
};
13187
13188static PyObject *
13189unicode_iter(PyObject *seq)
13190{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013191 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013192
Benjamin Peterson14339b62009-01-31 16:36:08 +000013193 if (!PyUnicode_Check(seq)) {
13194 PyErr_BadInternalCall();
13195 return NULL;
13196 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013197 if (PyUnicode_READY(seq) == -1)
13198 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013199 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
13200 if (it == NULL)
13201 return NULL;
13202 it->it_index = 0;
13203 Py_INCREF(seq);
13204 it->it_seq = (PyUnicodeObject *)seq;
13205 _PyObject_GC_TRACK(it);
13206 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013207}
13208
/* "uniops.h" is included twice as a template: first with UNIOP(x)
   expanding to Py_UNICODE_x and UNIOP_t to Py_UNICODE, then with
   UNIOP(x) expanding to Py_UCS4_x and UNIOP_t to Py_UCS4.  Both macros
   are undefined again after each inclusion.
   NOTE(review): presumably the header defines the same set of helper
   functions once per character type -- confirm against uniops.h. */
#define UNIOP(x) Py_UNICODE_##x
#define UNIOP_t Py_UNICODE
#include "uniops.h"
#undef UNIOP
#undef UNIOP_t
#define UNIOP(x) Py_UCS4_##x
#define UNIOP_t Py_UCS4
#include "uniops.h"
#undef UNIOP
#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000013219
Victor Stinner71133ff2010-09-01 23:43:53 +000013220Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000013221PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000013222{
13223 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
13224 Py_UNICODE *copy;
13225 Py_ssize_t size;
13226
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013227 if (!PyUnicode_Check(unicode)) {
13228 PyErr_BadArgument();
13229 return NULL;
13230 }
Victor Stinner71133ff2010-09-01 23:43:53 +000013231 /* Ensure we won't overflow the size. */
13232 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
13233 PyErr_NoMemory();
13234 return NULL;
13235 }
13236 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
13237 size *= sizeof(Py_UNICODE);
13238 copy = PyMem_Malloc(size);
13239 if (copy == NULL) {
13240 PyErr_NoMemory();
13241 return NULL;
13242 }
13243 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
13244 return copy;
13245}
Martin v. Löwis5b222132007-06-10 09:51:05 +000013246
Georg Brandl66c221e2010-10-14 07:04:07 +000013247/* A _string module, to export formatter_parser and formatter_field_name_split
13248 to the string.Formatter class implemented in Python. */
13249
/* Functions exported by the _string module.  Both are METH_O, i.e. they
   are called with exactly one argument object. */
static PyMethodDef _string_methods[] = {
    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
     METH_O, PyDoc_STR("split the argument as a field name")},
    {"formatter_parser", (PyCFunction) formatter_parser,
     METH_O, PyDoc_STR("parse the argument as a format string")},
    {NULL, NULL}    /* sentinel */
};
13257
/* Module definition for "_string"; it only carries the method table
   above.  The trailing slots are all left NULL. */
static struct PyModuleDef _string_module = {
    PyModuleDef_HEAD_INIT,
    "_string",                          /* m_name */
    PyDoc_STR("string helper module"),  /* m_doc */
    0,                                  /* m_size */
    _string_methods,                    /* m_methods */
    NULL,
    NULL,
    NULL,
    NULL
};
13269
13270PyMODINIT_FUNC
13271PyInit__string(void)
13272{
13273 return PyModule_Create(&_string_module);
13274}
13275
13276
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013277#ifdef __cplusplus
13278}
13279#endif