blob: ddfe566bcee08177aa5949f069ffad968e3dd153 [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
/* Upper bound on the number of objects kept on the Unicode free list. */

#define PyUnicode_MAXFREELIST       1024

/* Size limit for the free list "stay alive" optimization.

   Unicode memory of objects on the free list is kept allocated as long
   as its size is below this limit, which saves malloc() round trips for
   small Unicode objects.

   The worst-case amount of unused garbage is PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes.

   A limit of 0 effectively switches the feature off.

   Note: This is an experimental feature!  If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Endianness switches; little endian unless WORDS_BIGENDIAN is defined */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
79
/* --- Globals ------------------------------------------------------------

   The globals are initialized by the _PyUnicode_Init() API and should
   not be used before calling that API.

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000086
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000087
88#ifdef __cplusplus
89extern "C" {
90#endif
91
/* Sanity check used by the accessor macros below: the full consistency
   check in debug builds, a plain type check otherwise. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif

/* Unchecked access to the utf8 pointer of a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* Checked variant: for a compact ASCII string the address directly behind
   the PyASCIIObject header is returned instead of the utf8 field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))

/* Unchecked access to the utf8_length field. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* Checked variant: a compact ASCII string reports its character length. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))

/* Unchecked field accessors; all of them are usable as lvalues. */
#define _PyUnicode_WSTR(op) (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op) (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) (((PyASCIIObject *)(op))->hash)

/* Kind and length accessors that first assert _PyUnicode_CHECK(op). */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* Unchecked access to the data pointer of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op) (((PyUnicodeObject*)(op))->data.any)

/* Local replacement for PyUnicode_READY: evaluates to 0 when the string is
   already ready, otherwise to the result of _PyUnicode_Ready(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 : _PyUnicode_Ready((PyObject *)(op))))

/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data) */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
140
/* Generic helper macro to convert characters of different types.

   from_type and to_type must be valid type names.  begin and end delimit
   the source characters and should be of type "from_type *".  to points
   to the destination buffer; it is cast to "to_type *" and receives one
   converted character per source character.  Note that end is evaluated
   once per iteration. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        const from_type *iter_ = (begin);               \
        to_type *to_ = (to_type *)(to);                 \
        while (iter_ < (end)) {                         \
            *to_ = (to_type)*iter_;                     \
            ++iter_;                                    \
            ++to_;                                      \
        }                                               \
    } while (0)

/* The Unicode string has been modified: reset the hash */
#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)
158
Walter Dörwald16807132007-05-25 13:52:07 +0000159/* This dictionary holds all interned unicode strings. Note that references
160 to strings in this dictionary are *not* counted in the string's ob_refcnt.
161 When the interned string reaches a refcnt of 0 the string deallocation
162 function will delete the reference from this dictionary.
163
164 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000165 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000166*/
167static PyObject *interned;
168
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000169/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200170static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000171
172/* Single character Unicode strings in the Latin-1 range are being
173 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200174static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000175
/* Fast detection of the most frequent whitespace characters.

   Lookup table indexed by code point for the range 0x00..0x7F; an entry
   is 1 for the characters named in the row comments and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x00 - 0x07 */
    0, 1, 1, 1, 1, 1, 0, 0,     /* 0x08 - 0x0F: 0x09 CHARACTER TABULATION,
                                   0x0A LINE FEED, 0x0B LINE TABULATION,
                                   0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x10 - 0x17 */
    0, 0, 0, 0, 1, 1, 1, 1,     /* 0x18 - 0x1F: 0x1C FILE SEPARATOR,
                                   0x1D GROUP SEPARATOR,
                                   0x1E RECORD SEPARATOR,
                                   0x1F UNIT SEPARATOR */
    1, 0, 0, 0, 0, 0, 0, 0,     /* 0x20 - 0x27: 0x20 SPACE */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x28 - 0x2F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x30 - 0x37 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x38 - 0x3F */

    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x40 - 0x47 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x48 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x50 - 0x57 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x58 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x60 - 0x67 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x68 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x70 - 0x77 */
    0, 0, 0, 0, 0, 0, 0, 0      /* 0x78 - 0x7F */
};
206
Victor Stinnerfe226c02011-10-03 03:52:20 +0200207static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
208
Alexander Belopolsky40018472011-02-26 01:02:56 +0000209static PyObject *
210unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000211 PyObject **errorHandler,const char *encoding, const char *reason,
212 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
213 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
214
Alexander Belopolsky40018472011-02-26 01:02:56 +0000215static void
216raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300217 const char *encoding,
218 const Py_UNICODE *unicode, Py_ssize_t size,
219 Py_ssize_t startpos, Py_ssize_t endpos,
220 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000221
/* Same for linebreaks: lookup table indexed by code point for the range
   0x00..0x7F; an entry is 1 for the characters named in the row comments
   and 0 otherwise. */
static unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x00 - 0x07 */
    0, 0, 1, 1, 1, 1, 0, 0,     /* 0x08 - 0x0F: 0x0A LINE FEED,
                                   0x0B LINE TABULATION, 0x0C FORM FEED,
                                   0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x10 - 0x17 */
    0, 0, 0, 0, 1, 1, 1, 0,     /* 0x18 - 0x1F: 0x1C FILE SEPARATOR,
                                   0x1D GROUP SEPARATOR,
                                   0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x20 - 0x27 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x28 - 0x2F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x30 - 0x37 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x38 - 0x3F */

    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x40 - 0x47 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x48 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x50 - 0x57 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x58 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x60 - 0x67 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x68 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x70 - 0x77 */
    0, 0, 0, 0, 0, 0, 0, 0      /* 0x78 - 0x7F */
};
249
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300250/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
251 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000252Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000253PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000254{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000255#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000256 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000257#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000258 /* This is actually an illegal character, so it should
259 not be passed to unichr. */
260 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000261#endif
262}
263
#ifdef Py_DEBUG
/* Assert that the state bits and pointers of a unicode object form one of
   the accepted combinations (compact ASCII, compact non-ASCII, legacy
   not-ready "wstr", legacy ready).

   Always returns 1, so the call can itself be wrapped in assert(). */
static int
_PyUnicode_CheckConsistency(void *op)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    /* ASCII strings: always 1-byte, compact and ready. */
    if (ascii->state.ascii == 1) {
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.compact == 1);
        assert(ascii->state.ready == 1);
        return 1;
    }

    /* Compact non-ASCII strings: any byte kind, always ready. */
    if (ascii->state.compact == 1) {
        assert(kind == PyUnicode_1BYTE_KIND
               || kind == PyUnicode_2BYTE_KIND
               || kind == PyUnicode_4BYTE_KIND);
        assert(ascii->state.compact == 1);
        assert(ascii->state.ascii == 0);
        assert(ascii->state.ready == 1);
        return 1;
    }

    /* Legacy (non-compact) strings. */
    {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        PyUnicodeObject *unicode = (PyUnicodeObject *)op;

        if (kind == PyUnicode_WCHAR_KIND) {
            /* not ready yet: only the wstr buffer exists */
            assert(ascii->state.compact == 0);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 0);
            assert(ascii->wstr != NULL);
            assert(unicode->data.any == NULL);
            assert(compact->utf8 == NULL);
            assert(ascii->state.interned == SSTATE_NOT_INTERNED);
        }
        else {
            /* ready: a separately referenced data buffer exists */
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.compact == 0);
            assert(ascii->state.ready == 1);
            assert(unicode->data.any != NULL);
            assert(ascii->state.ascii == 0);
        }
    }
    return 1;
}
#endif
314
Thomas Wouters477c8d52006-05-27 19:21:47 +0000315/* --- Bloom Filters ----------------------------------------------------- */
316
317/* stuff to implement simple "bloom filters" for Unicode characters.
318 to keep things simple, we use a single bitmask, using the least 5
319 bits from each unicode characters as the bit index. */
320
321/* the linebreak mask is set up by Unicode_Init below */
322
Antoine Pitrouf068f942010-01-13 14:19:12 +0000323#if LONG_BIT >= 128
324#define BLOOM_WIDTH 128
325#elif LONG_BIT >= 64
326#define BLOOM_WIDTH 64
327#elif LONG_BIT >= 32
328#define BLOOM_WIDTH 32
329#else
330#error "LONG_BIT is smaller than 32"
331#endif
332
Thomas Wouters477c8d52006-05-27 19:21:47 +0000333#define BLOOM_MASK unsigned long
334
335static BLOOM_MASK bloom_linebreak;
336
Antoine Pitrouf068f942010-01-13 14:19:12 +0000337#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
338#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000339
Benjamin Peterson29060642009-01-31 22:14:21 +0000340#define BLOOM_LINEBREAK(ch) \
341 ((ch) < 128U ? ascii_linebreak[(ch)] : \
342 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000343
Alexander Belopolsky40018472011-02-26 01:02:56 +0000344Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200345make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000346{
347 /* calculate simple bloom-style bitmask for a given unicode string */
348
Antoine Pitrouf068f942010-01-13 14:19:12 +0000349 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000350 Py_ssize_t i;
351
352 mask = 0;
353 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200354 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000355
356 return mask;
357}
358
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200359#define BLOOM_MEMBER(mask, chr, str) \
360 (BLOOM(mask, chr) \
361 && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000362
Guido van Rossumd57fd912000-03-10 22:53:23 +0000363/* --- Unicode Object ----------------------------------------------------- */
364
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200365static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200366fixup(PyUnicodeObject *self, Py_UCS4 (*fixfct)(PyUnicodeObject *s));
367
368Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
369 Py_ssize_t size, Py_UCS4 ch,
370 int direction)
371{
372 /* like wcschr, but doesn't stop at NULL characters */
373 Py_ssize_t i;
374 if (direction == 1) {
375 for(i = 0; i < size; i++)
376 if (PyUnicode_READ(kind, s, i) == ch)
377 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
378 }
379 else {
380 for(i = size-1; i >= 0; i--)
381 if (PyUnicode_READ(kind, s, i) == ch)
382 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
383 }
384 return NULL;
385}
386
Victor Stinnerfe226c02011-10-03 03:52:20 +0200387static PyObject*
388resize_compact(PyObject *unicode, Py_ssize_t length)
389{
390 Py_ssize_t char_size;
391 Py_ssize_t struct_size;
392 Py_ssize_t new_size;
393 int share_wstr;
394
395 assert(PyUnicode_IS_READY(unicode));
396 char_size = PyUnicode_CHARACTER_SIZE(unicode);
397 if (PyUnicode_IS_COMPACT_ASCII(unicode))
398 struct_size = sizeof(PyASCIIObject);
399 else
400 struct_size = sizeof(PyCompactUnicodeObject);
401 share_wstr = (_PyUnicode_WSTR(unicode) == PyUnicode_DATA(unicode));
402
403 _Py_DEC_REFTOTAL;
404 _Py_ForgetReference(unicode);
405
406 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
407 PyErr_NoMemory();
408 return NULL;
409 }
410 new_size = (struct_size + (length + 1) * char_size);
411
412 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
413 if (unicode == NULL) {
414 PyObject_Del(unicode);
415 PyErr_NoMemory();
416 return NULL;
417 }
418 _Py_NewReference(unicode);
419 _PyUnicode_LENGTH(unicode) = length;
420 if (share_wstr)
421 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
422 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
423 length, 0);
424 return unicode;
425}
426
Alexander Belopolsky40018472011-02-26 01:02:56 +0000427static int
Victor Stinnerfe226c02011-10-03 03:52:20 +0200428resize_inplace(register PyUnicodeObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000429{
430 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000431
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200432 assert(!PyUnicode_IS_COMPACT(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200433
Victor Stinnerfe226c02011-10-03 03:52:20 +0200434 assert(Py_REFCNT(unicode) == 1);
Victor Stinnerb15d4d82011-09-28 23:59:20 +0200435 _PyUnicode_DIRTY(unicode);
Tim Petersced69f82003-09-16 20:30:58 +0000436
Victor Stinnerfe226c02011-10-03 03:52:20 +0200437 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
438 {
439 PyObject_DEL(_PyUnicode_UTF8(unicode));
440 _PyUnicode_UTF8(unicode) = NULL;
441 }
442
443 if (PyUnicode_IS_READY(unicode)) {
444 Py_ssize_t char_size;
445 Py_ssize_t new_size;
446 int share_wstr;
447 void *data;
448
449 data = _PyUnicode_DATA_ANY(unicode);
450 assert(data != NULL);
451 char_size = PyUnicode_CHARACTER_SIZE(unicode);
452 share_wstr = (_PyUnicode_WSTR(unicode) == data);
453
454 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
455 PyErr_NoMemory();
456 return -1;
457 }
458 new_size = (length + 1) * char_size;
459
460 data = (PyObject *)PyObject_REALLOC(data, new_size);
461 if (data == NULL) {
462 PyErr_NoMemory();
463 return -1;
464 }
465 _PyUnicode_DATA_ANY(unicode) = data;
466 if (share_wstr)
467 _PyUnicode_WSTR(unicode) = data;
468 _PyUnicode_LENGTH(unicode) = length;
469 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
470 if (share_wstr)
471 return 0;
472 }
473 if (_PyUnicode_WSTR(unicode) != NULL) {
474 assert(_PyUnicode_WSTR(unicode) != NULL);
475
476 oldstr = _PyUnicode_WSTR(unicode);
477 _PyUnicode_WSTR(unicode) = PyObject_REALLOC(_PyUnicode_WSTR(unicode),
478 sizeof(Py_UNICODE) * (length + 1));
479 if (!_PyUnicode_WSTR(unicode)) {
480 _PyUnicode_WSTR(unicode) = (Py_UNICODE *)oldstr;
481 PyErr_NoMemory();
482 return -1;
483 }
484 _PyUnicode_WSTR(unicode)[length] = 0;
485 _PyUnicode_WSTR_LENGTH(unicode) = length;
486 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000487 return 0;
488}
489
Victor Stinnerfe226c02011-10-03 03:52:20 +0200490static PyObject*
491resize_copy(PyObject *unicode, Py_ssize_t length)
492{
493 Py_ssize_t copy_length;
494 if (PyUnicode_IS_COMPACT(unicode)) {
495 PyObject *copy;
496 assert(PyUnicode_IS_READY(unicode));
497
498 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
499 if (copy == NULL)
500 return NULL;
501
502 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
503 if (PyUnicode_CopyCharacters(copy, 0,
504 unicode, 0,
505 copy_length) < 0)
506 {
507 Py_DECREF(copy);
508 return NULL;
509 }
510 return copy;
511 } else {
512 assert(_PyUnicode_WSTR(unicode) != NULL);
513 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
514 PyUnicodeObject *w = _PyUnicode_New(length);
515 if (w == NULL)
516 return NULL;
517 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
518 copy_length = Py_MIN(copy_length, length);
519 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
520 copy_length);
521 return (PyObject*)w;
522 }
523}
524
Guido van Rossumd57fd912000-03-10 22:53:23 +0000525/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000526 Ux0000 terminated; some code (e.g. new_identifier)
527 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000528
529 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000530 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000531
532*/
533
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200534#ifdef Py_DEBUG
535int unicode_old_new_calls = 0;
536#endif
537
Alexander Belopolsky40018472011-02-26 01:02:56 +0000538static PyUnicodeObject *
539_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000540{
541 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200542 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000543
Thomas Wouters477c8d52006-05-27 19:21:47 +0000544 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000545 if (length == 0 && unicode_empty != NULL) {
546 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200547 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000548 }
549
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000550 /* Ensure we won't overflow the size. */
551 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
552 return (PyUnicodeObject *)PyErr_NoMemory();
553 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200554 if (length < 0) {
555 PyErr_SetString(PyExc_SystemError,
556 "Negative size passed to _PyUnicode_New");
557 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000558 }
559
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200560#ifdef Py_DEBUG
561 ++unicode_old_new_calls;
562#endif
563
564 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
565 if (unicode == NULL)
566 return NULL;
567 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
568 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
569 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000570 PyErr_NoMemory();
571 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000572 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200573
Jeremy Hyltond8082792003-09-16 19:41:39 +0000574 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000575 * the caller fails before initializing str -- unicode_resize()
576 * reads str[0], and the Keep-Alive optimization can keep memory
577 * allocated for str alive across a call to unicode_dealloc(unicode).
578 * We don't want unicode_resize to read uninitialized memory in
579 * that case.
580 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200581 _PyUnicode_WSTR(unicode)[0] = 0;
582 _PyUnicode_WSTR(unicode)[length] = 0;
583 _PyUnicode_WSTR_LENGTH(unicode) = length;
584 _PyUnicode_HASH(unicode) = -1;
585 _PyUnicode_STATE(unicode).interned = 0;
586 _PyUnicode_STATE(unicode).kind = 0;
587 _PyUnicode_STATE(unicode).compact = 0;
588 _PyUnicode_STATE(unicode).ready = 0;
589 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200590 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200591 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200592 _PyUnicode_UTF8(unicode) = NULL;
593 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000594 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000595
Benjamin Peterson29060642009-01-31 22:14:21 +0000596 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000597 /* XXX UNREF/NEWREF interface should be more symmetrical */
598 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000599 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000600 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000601 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000602}
603
Victor Stinnerf42dc442011-10-02 23:33:16 +0200604static const char*
605unicode_kind_name(PyObject *unicode)
606{
Victor Stinner910337b2011-10-03 03:20:16 +0200607 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerf42dc442011-10-02 23:33:16 +0200608 if (!PyUnicode_IS_COMPACT(unicode))
609 {
610 if (!PyUnicode_IS_READY(unicode))
611 return "wstr";
612 switch(PyUnicode_KIND(unicode))
613 {
614 case PyUnicode_1BYTE_KIND:
615 if (PyUnicode_IS_COMPACT_ASCII(unicode))
616 return "legacy ascii";
617 else
618 return "legacy latin1";
619 case PyUnicode_2BYTE_KIND:
620 return "legacy UCS2";
621 case PyUnicode_4BYTE_KIND:
622 return "legacy UCS4";
623 default:
624 return "<legacy invalid kind>";
625 }
626 }
627 assert(PyUnicode_IS_READY(unicode));
628 switch(PyUnicode_KIND(unicode))
629 {
630 case PyUnicode_1BYTE_KIND:
631 if (PyUnicode_IS_COMPACT_ASCII(unicode))
632 return "ascii";
633 else
634 return "compact latin1";
635 case PyUnicode_2BYTE_KIND:
636 return "compact UCS2";
637 case PyUnicode_4BYTE_KIND:
638 return "compact UCS4";
639 default:
640 return "<invalid compact kind>";
641 }
642}
643
#ifdef Py_DEBUG
int unicode_new_new_calls = 0;

/* Functions wrapping macros for use in debugger */
char *_PyUnicode_utf8(void *unicode){
    return PyUnicode_UTF8(unicode);
}

void *_PyUnicode_compact_data(void *unicode) {
    return _PyUnicode_COMPACT_DATA(unicode);
}

/* Print the addresses that the data macros compute for unicode to stdout
   and return PyUnicode_DATA(unicode). */
void *_PyUnicode_data(void *unicode){
    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}

/* Print a one-line summary of the internal fields of op to stdout: the
   kind name, length and wstr pointer, followed by wstr_length, utf8 and
   utf8_length for non-ASCII strings and the data pointer for non-compact
   strings. */
void
_PyUnicode_Dump(PyObject *op)
{
    PyASCIIObject *ascii = (PyASCIIObject *)op;

    /* The lengths are Py_ssize_t (signed), hence %zd; %p requires a
       void * argument, hence the casts. */
    printf("%s: len=%zd, wstr=%p",
           unicode_kind_name(op),
           ascii->length,
           (void *)ascii->wstr);
    if (!ascii->state.ascii) {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        printf(" (%zd), utf8=%p (%zd)",
               compact->wstr_length,
               (void *)compact->utf8,
               compact->utf8_length);
    }
    if (!ascii->state.compact) {
        PyUnicodeObject *unicode = (PyUnicodeObject *)op;
        printf(", data=%p",
               (void *)unicode->data.any);
    }
    printf("\n");
}
#endif
688
689PyObject *
690PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
691{
692 PyObject *obj;
693 PyCompactUnicodeObject *unicode;
694 void *data;
695 int kind_state;
696 int is_sharing = 0, is_ascii = 0;
697 Py_ssize_t char_size;
698 Py_ssize_t struct_size;
699
700 /* Optimization for empty strings */
701 if (size == 0 && unicode_empty != NULL) {
702 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200703 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200704 }
705
706#ifdef Py_DEBUG
707 ++unicode_new_new_calls;
708#endif
709
710 struct_size = sizeof(PyCompactUnicodeObject);
711 if (maxchar < 128) {
712 kind_state = PyUnicode_1BYTE_KIND;
713 char_size = 1;
714 is_ascii = 1;
715 struct_size = sizeof(PyASCIIObject);
716 }
717 else if (maxchar < 256) {
718 kind_state = PyUnicode_1BYTE_KIND;
719 char_size = 1;
720 }
721 else if (maxchar < 65536) {
722 kind_state = PyUnicode_2BYTE_KIND;
723 char_size = 2;
724 if (sizeof(wchar_t) == 2)
725 is_sharing = 1;
726 }
727 else {
728 kind_state = PyUnicode_4BYTE_KIND;
729 char_size = 4;
730 if (sizeof(wchar_t) == 4)
731 is_sharing = 1;
732 }
733
734 /* Ensure we won't overflow the size. */
735 if (size < 0) {
736 PyErr_SetString(PyExc_SystemError,
737 "Negative size passed to PyUnicode_New");
738 return NULL;
739 }
740 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
741 return PyErr_NoMemory();
742
743 /* Duplicated allocation code from _PyObject_New() instead of a call to
744 * PyObject_New() so we are able to allocate space for the object and
745 * it's data buffer.
746 */
747 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
748 if (obj == NULL)
749 return PyErr_NoMemory();
750 obj = PyObject_INIT(obj, &PyUnicode_Type);
751 if (obj == NULL)
752 return NULL;
753
754 unicode = (PyCompactUnicodeObject *)obj;
755 if (is_ascii)
756 data = ((PyASCIIObject*)obj) + 1;
757 else
758 data = unicode + 1;
759 _PyUnicode_LENGTH(unicode) = size;
760 _PyUnicode_HASH(unicode) = -1;
761 _PyUnicode_STATE(unicode).interned = 0;
762 _PyUnicode_STATE(unicode).kind = kind_state;
763 _PyUnicode_STATE(unicode).compact = 1;
764 _PyUnicode_STATE(unicode).ready = 1;
765 _PyUnicode_STATE(unicode).ascii = is_ascii;
766 if (is_ascii) {
767 ((char*)data)[size] = 0;
768 _PyUnicode_WSTR(unicode) = NULL;
769 }
770 else if (kind_state == PyUnicode_1BYTE_KIND) {
771 ((char*)data)[size] = 0;
772 _PyUnicode_WSTR(unicode) = NULL;
773 _PyUnicode_WSTR_LENGTH(unicode) = 0;
774 unicode->utf8_length = 0;
775 unicode->utf8 = NULL;
776 }
777 else {
778 unicode->utf8 = NULL;
779 if (kind_state == PyUnicode_2BYTE_KIND)
780 ((Py_UCS2*)data)[size] = 0;
781 else /* kind_state == PyUnicode_4BYTE_KIND */
782 ((Py_UCS4*)data)[size] = 0;
783 if (is_sharing) {
784 _PyUnicode_WSTR_LENGTH(unicode) = size;
785 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
786 }
787 else {
788 _PyUnicode_WSTR_LENGTH(unicode) = 0;
789 _PyUnicode_WSTR(unicode) = NULL;
790 }
791 }
792 return obj;
793}
794
#if SIZEOF_WCHAR_T == 2
/* Decode the 16-bit wchar_t buffer [begin, end) into the UCS4 storage of
   `unicode`.  A high surrogate immediately followed by a low surrogate is
   merged into a single code point; every other code unit (including an
   unpaired surrogate) is copied through unchanged.  The remaining width
   conversions are implemented as macros for efficiency.

   This function assumes that unicode can hold one more code point than
   wstr characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyUnicodeObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        int is_pair;

        /* There must still be room for the code point written below. */
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));

        is_pair = (src[0] >= 0xD800 && src[0] <= 0xDBFF
                   && (src + 1) < end
                   && src[1] >= 0xDC00 && src[1] <= 0xDFFF);
        if (!is_pair) {
            *dst++ = *src;
            src++;
            continue;
        }
        *dst++ = (((src[0] & 0x3FF) << 10) | (src[1] & 0x3FF)) + 0x10000;
        src += 2;
    }

    /* The output must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
833
Victor Stinnercd9950f2011-10-02 00:34:53 +0200834static int
835_PyUnicode_Dirty(PyObject *unicode)
836{
Victor Stinner910337b2011-10-03 03:20:16 +0200837 assert(_PyUnicode_CHECK(unicode));
Victor Stinnercd9950f2011-10-02 00:34:53 +0200838 if (Py_REFCNT(unicode) != 1) {
839 PyErr_SetString(PyExc_ValueError,
840 "Cannot modify a string having more than 1 reference");
841 return -1;
842 }
843 _PyUnicode_DIRTY(unicode);
844 return 0;
845}
846
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200847Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200848PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
849 PyObject *from, Py_ssize_t from_start,
850 Py_ssize_t how_many)
851{
Victor Stinnera0702ab2011-09-29 14:14:38 +0200852 unsigned int from_kind, to_kind;
853 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200854
Victor Stinnerb1536152011-09-30 02:26:10 +0200855 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
856 PyErr_BadInternalCall();
857 return -1;
858 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200859
860 if (PyUnicode_READY(from))
861 return -1;
862 if (PyUnicode_READY(to))
863 return -1;
864
Victor Stinnerff9e50f2011-09-28 22:17:19 +0200865 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200866 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
867 PyErr_Format(PyExc_ValueError,
868 "Cannot write %zi characters at %zi "
869 "in a string of %zi characters",
870 how_many, to_start, PyUnicode_GET_LENGTH(to));
871 return -1;
872 }
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200873 if (how_many == 0)
874 return 0;
875
Victor Stinnercd9950f2011-10-02 00:34:53 +0200876 if (_PyUnicode_Dirty(to))
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200877 return -1;
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200878
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200879 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200880 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200881 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200882 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200883
Victor Stinnerf42dc442011-10-02 23:33:16 +0200884 if (from_kind == to_kind
885 /* deny latin1 => ascii */
886 && PyUnicode_MAX_CHAR_VALUE(to) >= PyUnicode_MAX_CHAR_VALUE(from))
887 {
Victor Stinnera0702ab2011-09-29 14:14:38 +0200888 Py_MEMCPY((char*)to_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200889 + PyUnicode_KIND_SIZE(to_kind, to_start),
Victor Stinnera0702ab2011-09-29 14:14:38 +0200890 (char*)from_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200891 + PyUnicode_KIND_SIZE(from_kind, from_start),
892 PyUnicode_KIND_SIZE(to_kind, how_many));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200893 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200894 else if (from_kind == PyUnicode_1BYTE_KIND
895 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200896 {
897 _PyUnicode_CONVERT_BYTES(
898 Py_UCS1, Py_UCS2,
899 PyUnicode_1BYTE_DATA(from) + from_start,
900 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
901 PyUnicode_2BYTE_DATA(to) + to_start
902 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200903 }
Victor Stinner157f83f2011-09-28 21:41:31 +0200904 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200905 && to_kind == PyUnicode_4BYTE_KIND)
906 {
907 _PyUnicode_CONVERT_BYTES(
908 Py_UCS1, Py_UCS4,
909 PyUnicode_1BYTE_DATA(from) + from_start,
910 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
911 PyUnicode_4BYTE_DATA(to) + to_start
912 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200913 }
914 else if (from_kind == PyUnicode_2BYTE_KIND
915 && to_kind == PyUnicode_4BYTE_KIND)
916 {
917 _PyUnicode_CONVERT_BYTES(
918 Py_UCS2, Py_UCS4,
919 PyUnicode_2BYTE_DATA(from) + from_start,
920 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
921 PyUnicode_4BYTE_DATA(to) + to_start
922 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200923 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200924 else {
925 int invalid_kinds;
Victor Stinnerf42dc442011-10-02 23:33:16 +0200926
927 /* check if max_char(from substring) <= max_char(to) */
928 if (from_kind > to_kind
929 /* latin1 => ascii */
930 || (PyUnicode_IS_COMPACT_ASCII(to)
931 && to_kind == PyUnicode_1BYTE_KIND
932 && !PyUnicode_IS_COMPACT_ASCII(from)))
933 {
Victor Stinnera0702ab2011-09-29 14:14:38 +0200934 /* slow path to check for character overflow */
935 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
936 Py_UCS4 ch, maxchar;
937 Py_ssize_t i;
938
939 maxchar = 0;
940 invalid_kinds = 0;
941 for (i=0; i < how_many; i++) {
942 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
943 if (ch > maxchar) {
944 maxchar = ch;
945 if (maxchar > to_maxchar) {
946 invalid_kinds = 1;
947 break;
948 }
949 }
950 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
951 }
952 }
953 else
954 invalid_kinds = 1;
955 if (invalid_kinds) {
956 PyErr_Format(PyExc_ValueError,
Victor Stinnerf42dc442011-10-02 23:33:16 +0200957 "Cannot copy %s characters "
958 "into a string of %s characters",
959 unicode_kind_name(from),
960 unicode_kind_name(to));
Victor Stinnera0702ab2011-09-29 14:14:38 +0200961 return -1;
962 }
963 }
964 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200965}
966
Victor Stinner17222162011-09-28 22:15:37 +0200967/* Find the maximum code point and count the number of surrogate pairs so a
968 correct string length can be computed before converting a string to UCS4.
969 This function counts single surrogates as a character and not as a pair.
970
971 Return 0 on success, or -1 on error. */
972static int
973find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
974 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200975{
976 const wchar_t *iter;
977
Victor Stinnerc53be962011-10-02 21:33:54 +0200978 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200979 if (num_surrogates == NULL || maxchar == NULL) {
980 PyErr_SetString(PyExc_SystemError,
981 "unexpected NULL arguments to "
982 "PyUnicode_FindMaxCharAndNumSurrogatePairs");
983 return -1;
984 }
985
986 *num_surrogates = 0;
987 *maxchar = 0;
988
989 for (iter = begin; iter < end; ) {
990 if (*iter > *maxchar)
991 *maxchar = *iter;
992#if SIZEOF_WCHAR_T == 2
993 if (*iter >= 0xD800 && *iter <= 0xDBFF
994 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
995 {
996 Py_UCS4 surrogate_val;
997 surrogate_val = (((iter[0] & 0x3FF)<<10)
998 | (iter[1] & 0x3FF)) + 0x10000;
999 ++(*num_surrogates);
1000 if (surrogate_val > *maxchar)
1001 *maxchar = surrogate_val;
1002 iter += 2;
1003 }
1004 else
1005 iter++;
1006#else
1007 iter++;
1008#endif
1009 }
1010 return 0;
1011}
1012
1013#ifdef Py_DEBUG
1014int unicode_ready_calls = 0;
1015#endif
1016
1017int
Victor Stinnerd8f65102011-09-29 19:43:17 +02001018_PyUnicode_Ready(PyObject *obj)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001019{
Victor Stinnerd8f65102011-09-29 19:43:17 +02001020 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001021 wchar_t *end;
1022 Py_UCS4 maxchar = 0;
1023 Py_ssize_t num_surrogates;
1024#if SIZEOF_WCHAR_T == 2
1025 Py_ssize_t length_wo_surrogates;
1026#endif
1027
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001028 /* _PyUnicode_Ready() is only intented for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001029 strings were created using _PyObject_New() and where no canonical
1030 representation (the str field) has been set yet aka strings
1031 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001032 assert(_PyUnicode_CHECK(unicode));
1033 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001034 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001035 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001036 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001037 /* Actually, it should neither be interned nor be anything else: */
1038 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001039
1040#ifdef Py_DEBUG
1041 ++unicode_ready_calls;
1042#endif
1043
1044 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001045 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001046 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001047 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001048
1049 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001050 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1051 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001052 PyErr_NoMemory();
1053 return -1;
1054 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001055 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001056 _PyUnicode_WSTR(unicode), end,
1057 PyUnicode_1BYTE_DATA(unicode));
1058 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1059 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1060 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1061 if (maxchar < 128) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001062 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001063 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001064 }
1065 else {
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001066 _PyUnicode_UTF8(unicode) = NULL;
1067 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001068 }
1069 PyObject_FREE(_PyUnicode_WSTR(unicode));
1070 _PyUnicode_WSTR(unicode) = NULL;
1071 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1072 }
1073 /* In this case we might have to convert down from 4-byte native
1074 wchar_t to 2-byte unicode. */
1075 else if (maxchar < 65536) {
1076 assert(num_surrogates == 0 &&
1077 "FindMaxCharAndNumSurrogatePairs() messed up");
1078
Victor Stinner506f5922011-09-28 22:34:18 +02001079#if SIZEOF_WCHAR_T == 2
1080 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001081 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001082 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1083 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1084 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001085 _PyUnicode_UTF8(unicode) = NULL;
1086 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001087#else
1088 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001089 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001090 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001091 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001092 PyErr_NoMemory();
1093 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001094 }
Victor Stinner506f5922011-09-28 22:34:18 +02001095 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1096 _PyUnicode_WSTR(unicode), end,
1097 PyUnicode_2BYTE_DATA(unicode));
1098 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1099 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1100 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001101 _PyUnicode_UTF8(unicode) = NULL;
1102 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001103 PyObject_FREE(_PyUnicode_WSTR(unicode));
1104 _PyUnicode_WSTR(unicode) = NULL;
1105 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1106#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001107 }
1108 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1109 else {
1110#if SIZEOF_WCHAR_T == 2
1111 /* in case the native representation is 2-bytes, we need to allocate a
1112 new normalized 4-byte version. */
1113 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001114 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1115 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001116 PyErr_NoMemory();
1117 return -1;
1118 }
1119 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1120 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001121 _PyUnicode_UTF8(unicode) = NULL;
1122 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinnerc53be962011-10-02 21:33:54 +02001123 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001124 PyObject_FREE(_PyUnicode_WSTR(unicode));
1125 _PyUnicode_WSTR(unicode) = NULL;
1126 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1127#else
1128 assert(num_surrogates == 0);
1129
Victor Stinnerc3c74152011-10-02 20:39:55 +02001130 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001131 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001132 _PyUnicode_UTF8(unicode) = NULL;
1133 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001134 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1135#endif
1136 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1137 }
1138 _PyUnicode_STATE(unicode).ready = 1;
1139 return 0;
1140}
1141
Alexander Belopolsky40018472011-02-26 01:02:56 +00001142static void
1143unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001144{
Walter Dörwald16807132007-05-25 13:52:07 +00001145 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001146 case SSTATE_NOT_INTERNED:
1147 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001148
Benjamin Peterson29060642009-01-31 22:14:21 +00001149 case SSTATE_INTERNED_MORTAL:
1150 /* revive dead object temporarily for DelItem */
1151 Py_REFCNT(unicode) = 3;
1152 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
1153 Py_FatalError(
1154 "deletion of interned string failed");
1155 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001156
Benjamin Peterson29060642009-01-31 22:14:21 +00001157 case SSTATE_INTERNED_IMMORTAL:
1158 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001159
Benjamin Peterson29060642009-01-31 22:14:21 +00001160 default:
1161 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001162 }
1163
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001164 if (_PyUnicode_WSTR(unicode) &&
1165 (!PyUnicode_IS_READY(unicode) ||
1166 _PyUnicode_WSTR(unicode) != PyUnicode_DATA(unicode)))
1167 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001168 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001169 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001170
1171 if (PyUnicode_IS_COMPACT(unicode)) {
1172 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001173 }
1174 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001175 if (_PyUnicode_DATA_ANY(unicode))
1176 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Benjamin Peterson29060642009-01-31 22:14:21 +00001177 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001178 }
1179}
1180
Alexander Belopolsky40018472011-02-26 01:02:56 +00001181static int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001182unicode_resizable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001183{
Victor Stinnera3be6132011-10-03 02:16:37 +02001184 Py_ssize_t len;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001185 if (Py_REFCNT(unicode) != 1)
1186 return 0;
1187 if (PyUnicode_CHECK_INTERNED(unicode))
1188 return 0;
1189 if (unicode == unicode_empty)
1190 return 0;
Victor Stinnera3be6132011-10-03 02:16:37 +02001191 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
1192 len = PyUnicode_WSTR_LENGTH(unicode);
1193 else
1194 len = PyUnicode_GET_LENGTH(unicode);
1195 if (len == 1) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001196 Py_UCS4 ch;
Victor Stinnera3be6132011-10-03 02:16:37 +02001197 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001198 ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnera3be6132011-10-03 02:16:37 +02001199 else
1200 ch = PyUnicode_READ_CHAR(unicode, 0);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001201 if (ch < 256 && unicode_latin1[ch] == unicode)
1202 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001203 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001204 /* FIXME: reenable resize_inplace */
1205 if (!PyUnicode_IS_COMPACT(unicode))
1206 return 0;
1207 return 1;
1208}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001209
Victor Stinnerfe226c02011-10-03 03:52:20 +02001210static int
1211unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1212{
1213 PyObject *unicode;
1214 Py_ssize_t old_length;
1215
1216 assert(p_unicode != NULL);
1217 unicode = *p_unicode;
1218
1219 assert(unicode != NULL);
1220 assert(PyUnicode_Check(unicode));
1221 assert(0 <= length);
1222
Victor Stinner910337b2011-10-03 03:20:16 +02001223 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001224 old_length = PyUnicode_WSTR_LENGTH(unicode);
1225 else
1226 old_length = PyUnicode_GET_LENGTH(unicode);
1227 if (old_length == length)
1228 return 0;
1229
1230 /* FIXME: really create a new object? */
1231 if (!unicode_resizable(unicode)) {
1232 PyObject *copy = resize_copy(unicode, length);
1233 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001234 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001235 Py_DECREF(*p_unicode);
1236 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001237 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001238 }
1239
Victor Stinnerfe226c02011-10-03 03:52:20 +02001240 if (PyUnicode_IS_COMPACT(unicode)) {
1241 *p_unicode = resize_compact(unicode, length);
1242 if (*p_unicode == NULL)
1243 return -1;
1244 return 0;
1245 } else
1246 return resize_inplace((PyUnicodeObject*)unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001247}
1248
Alexander Belopolsky40018472011-02-26 01:02:56 +00001249int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001250PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001251{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001252 PyObject *unicode;
1253 if (p_unicode == NULL) {
1254 PyErr_BadInternalCall();
1255 return -1;
1256 }
1257 unicode = *p_unicode;
1258 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0
1259 || _PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND)
1260 {
1261 PyErr_BadInternalCall();
1262 return -1;
1263 }
1264 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001265}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001266
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001267static PyObject*
1268get_latin1_char(unsigned char ch)
1269{
Victor Stinnera464fc12011-10-02 20:39:30 +02001270 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001271 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001272 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001273 if (!unicode)
1274 return NULL;
1275 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1276 unicode_latin1[ch] = unicode;
1277 }
1278 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001279 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001280}
1281
Alexander Belopolsky40018472011-02-26 01:02:56 +00001282PyObject *
1283PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001284{
1285 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001286 Py_UCS4 maxchar = 0;
1287 Py_ssize_t num_surrogates;
1288
1289 if (u == NULL)
1290 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001291
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001292 /* If the Unicode data is known at construction time, we can apply
1293 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001294
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001295 /* Optimization for empty strings */
1296 if (size == 0 && unicode_empty != NULL) {
1297 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001298 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001299 }
Tim Petersced69f82003-09-16 20:30:58 +00001300
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001301 /* Single character Unicode objects in the Latin-1 range are
1302 shared when using this constructor */
1303 if (size == 1 && *u < 256)
1304 return get_latin1_char((unsigned char)*u);
1305
1306 /* If not empty and not single character, copy the Unicode data
1307 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001308 if (find_maxchar_surrogates(u, u + size,
1309 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001310 return NULL;
1311
1312 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1313 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001314 if (!unicode)
1315 return NULL;
1316
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001317 switch (PyUnicode_KIND(unicode)) {
1318 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001319 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001320 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1321 break;
1322 case PyUnicode_2BYTE_KIND:
1323#if Py_UNICODE_SIZE == 2
1324 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1325#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001326 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001327 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1328#endif
1329 break;
1330 case PyUnicode_4BYTE_KIND:
1331#if SIZEOF_WCHAR_T == 2
1332 /* This is the only case which has to process surrogates, thus
1333 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001334 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001335#else
1336 assert(num_surrogates == 0);
1337 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1338#endif
1339 break;
1340 default:
1341 assert(0 && "Impossible state");
1342 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001343
1344 return (PyObject *)unicode;
1345}
1346
Alexander Belopolsky40018472011-02-26 01:02:56 +00001347PyObject *
1348PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001349{
1350 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001351
Benjamin Peterson14339b62009-01-31 16:36:08 +00001352 if (size < 0) {
1353 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001354 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001355 return NULL;
1356 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001357
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001358 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001359 some optimizations which share commonly used objects.
1360 Also, this means the input must be UTF-8, so fall back to the
1361 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001362 if (u != NULL) {
1363
Benjamin Peterson29060642009-01-31 22:14:21 +00001364 /* Optimization for empty strings */
1365 if (size == 0 && unicode_empty != NULL) {
1366 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001367 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001368 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001369
1370 /* Single characters are shared when using this constructor.
1371 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001372 if (size == 1 && Py_CHARMASK(*u) < 128)
1373 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001374
1375 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001376 }
1377
Walter Dörwald55507312007-05-18 13:12:10 +00001378 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001379 if (!unicode)
1380 return NULL;
1381
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001382 return (PyObject *)unicode;
1383}
1384
Alexander Belopolsky40018472011-02-26 01:02:56 +00001385PyObject *
1386PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001387{
1388 size_t size = strlen(u);
1389 if (size > PY_SSIZE_T_MAX) {
1390 PyErr_SetString(PyExc_OverflowError, "input too long");
1391 return NULL;
1392 }
1393
1394 return PyUnicode_FromStringAndSize(u, size);
1395}
1396
Victor Stinnere57b1c02011-09-28 22:20:48 +02001397static PyObject*
1398_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001399{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001400 PyObject *res;
1401 unsigned char max = 127;
1402 Py_ssize_t i;
1403 for (i = 0; i < size; i++) {
1404 if (u[i] & 0x80) {
1405 max = 255;
1406 break;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001407 }
1408 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001409 res = PyUnicode_New(size, max);
1410 if (!res)
1411 return NULL;
1412 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1413 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001414}
1415
Victor Stinnere57b1c02011-09-28 22:20:48 +02001416static PyObject*
1417_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001418{
1419 PyObject *res;
1420 Py_UCS2 max = 0;
1421 Py_ssize_t i;
1422 for (i = 0; i < size; i++)
1423 if (u[i] > max)
1424 max = u[i];
1425 res = PyUnicode_New(size, max);
1426 if (!res)
1427 return NULL;
1428 if (max >= 256)
1429 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1430 else
1431 for (i = 0; i < size; i++)
1432 PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i];
1433 return res;
1434}
1435
Victor Stinnere57b1c02011-09-28 22:20:48 +02001436static PyObject*
1437_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001438{
1439 PyObject *res;
1440 Py_UCS4 max = 0;
1441 Py_ssize_t i;
1442 for (i = 0; i < size; i++)
1443 if (u[i] > max)
1444 max = u[i];
1445 res = PyUnicode_New(size, max);
1446 if (!res)
1447 return NULL;
1448 if (max >= 0x10000)
1449 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1450 else {
1451 int kind = PyUnicode_KIND(res);
1452 void *data = PyUnicode_DATA(res);
1453 for (i = 0; i < size; i++)
1454 PyUnicode_WRITE(kind, data, i, u[i]);
1455 }
1456 return res;
1457}
1458
1459PyObject*
1460PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1461{
1462 switch(kind) {
1463 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001464 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001465 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001466 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001467 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001468 return _PyUnicode_FromUCS4(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001469 }
Victor Stinner202b62b2011-10-01 23:48:37 +02001470 PyErr_SetString(PyExc_ValueError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001471 return NULL;
1472}
1473
Victor Stinner034f6cf2011-09-30 02:26:44 +02001474PyObject*
1475PyUnicode_Copy(PyObject *unicode)
1476{
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001477 Py_ssize_t size;
1478 PyObject *copy;
1479 void *data;
1480
Victor Stinner034f6cf2011-09-30 02:26:44 +02001481 if (!PyUnicode_Check(unicode)) {
1482 PyErr_BadInternalCall();
1483 return NULL;
1484 }
1485 if (PyUnicode_READY(unicode))
1486 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001487
1488 size = PyUnicode_GET_LENGTH(unicode);
1489 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1490 if (!copy)
1491 return NULL;
1492 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1493
1494 data = PyUnicode_DATA(unicode);
1495 switch (PyUnicode_KIND(unicode))
1496 {
1497 case PyUnicode_1BYTE_KIND:
1498 memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1499 break;
1500 case PyUnicode_2BYTE_KIND:
1501 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1502 break;
1503 case PyUnicode_4BYTE_KIND:
1504 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1505 break;
1506 default:
1507 assert(0);
1508 break;
1509 }
1510 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001511}
1512
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001513
Victor Stinnerbc603d12011-10-02 01:00:40 +02001514/* Widen Unicode objects to larger buffers. Don't write terminating null
1515 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001516
1517void*
1518_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1519{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001520 Py_ssize_t len;
1521 void *result;
1522 unsigned int skind;
1523
1524 if (PyUnicode_READY(s))
1525 return NULL;
1526
1527 len = PyUnicode_GET_LENGTH(s);
1528 skind = PyUnicode_KIND(s);
1529 if (skind >= kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001530 PyErr_SetString(PyExc_RuntimeError, "invalid widening attempt");
1531 return NULL;
1532 }
1533 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001534 case PyUnicode_2BYTE_KIND:
1535 result = PyMem_Malloc(len * sizeof(Py_UCS2));
1536 if (!result)
1537 return PyErr_NoMemory();
1538 assert(skind == PyUnicode_1BYTE_KIND);
1539 _PyUnicode_CONVERT_BYTES(
1540 Py_UCS1, Py_UCS2,
1541 PyUnicode_1BYTE_DATA(s),
1542 PyUnicode_1BYTE_DATA(s) + len,
1543 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001544 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001545 case PyUnicode_4BYTE_KIND:
1546 result = PyMem_Malloc(len * sizeof(Py_UCS4));
1547 if (!result)
1548 return PyErr_NoMemory();
1549 if (skind == PyUnicode_2BYTE_KIND) {
1550 _PyUnicode_CONVERT_BYTES(
1551 Py_UCS2, Py_UCS4,
1552 PyUnicode_2BYTE_DATA(s),
1553 PyUnicode_2BYTE_DATA(s) + len,
1554 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001555 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001556 else {
1557 assert(skind == PyUnicode_1BYTE_KIND);
1558 _PyUnicode_CONVERT_BYTES(
1559 Py_UCS1, Py_UCS4,
1560 PyUnicode_1BYTE_DATA(s),
1561 PyUnicode_1BYTE_DATA(s) + len,
1562 result);
1563 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001564 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001565 default:
1566 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001567 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001568 PyErr_SetString(PyExc_ValueError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001569 return NULL;
1570}
1571
1572static Py_UCS4*
1573as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1574 int copy_null)
1575{
1576 int kind;
1577 void *data;
1578 Py_ssize_t len, targetlen;
1579 if (PyUnicode_READY(string) == -1)
1580 return NULL;
1581 kind = PyUnicode_KIND(string);
1582 data = PyUnicode_DATA(string);
1583 len = PyUnicode_GET_LENGTH(string);
1584 targetlen = len;
1585 if (copy_null)
1586 targetlen++;
1587 if (!target) {
1588 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1589 PyErr_NoMemory();
1590 return NULL;
1591 }
1592 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1593 if (!target) {
1594 PyErr_NoMemory();
1595 return NULL;
1596 }
1597 }
1598 else {
1599 if (targetsize < targetlen) {
1600 PyErr_Format(PyExc_SystemError,
1601 "string is longer than the buffer");
1602 if (copy_null && 0 < targetsize)
1603 target[0] = 0;
1604 return NULL;
1605 }
1606 }
1607 if (kind != PyUnicode_4BYTE_KIND) {
1608 Py_ssize_t i;
1609 for (i = 0; i < len; i++)
1610 target[i] = PyUnicode_READ(kind, data, i);
1611 }
1612 else
1613 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1614 if (copy_null)
1615 target[len] = 0;
1616 return target;
1617}
1618
1619Py_UCS4*
1620PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1621 int copy_null)
1622{
1623 if (target == NULL || targetsize < 1) {
1624 PyErr_BadInternalCall();
1625 return NULL;
1626 }
1627 return as_ucs4(string, target, targetsize, copy_null);
1628}
1629
1630Py_UCS4*
1631PyUnicode_AsUCS4Copy(PyObject *string)
1632{
1633 return as_ucs4(string, NULL, 0, 1);
1634}
1635
1636#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00001637
Alexander Belopolsky40018472011-02-26 01:02:56 +00001638PyObject *
1639PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001640{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001641 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +00001642 if (size == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001643 return PyUnicode_New(0, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00001644 PyErr_BadInternalCall();
1645 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001646 }
1647
Martin v. Löwis790465f2008-04-05 20:41:37 +00001648 if (size == -1) {
1649 size = wcslen(w);
1650 }
1651
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001652 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001653}
1654
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001655#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00001656
Walter Dörwald346737f2007-05-31 10:44:43 +00001657static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001658makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
1659 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00001660{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001661 *fmt++ = '%';
1662 if (width) {
1663 if (zeropad)
1664 *fmt++ = '0';
1665 fmt += sprintf(fmt, "%d", width);
1666 }
1667 if (precision)
1668 fmt += sprintf(fmt, ".%d", precision);
1669 if (longflag)
1670 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001671 else if (longlongflag) {
1672 /* longlongflag should only ever be nonzero on machines with
1673 HAVE_LONG_LONG defined */
1674#ifdef HAVE_LONG_LONG
1675 char *f = PY_FORMAT_LONG_LONG;
1676 while (*f)
1677 *fmt++ = *f++;
1678#else
1679 /* we shouldn't ever get here */
1680 assert(0);
1681 *fmt++ = 'l';
1682#endif
1683 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001684 else if (size_tflag) {
1685 char *f = PY_FORMAT_SIZE_T;
1686 while (*f)
1687 *fmt++ = *f++;
1688 }
1689 *fmt++ = c;
1690 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00001691}
1692
Victor Stinner96865452011-03-01 23:44:09 +00001693/* helper for PyUnicode_FromFormatV() */
1694
1695static const char*
1696parse_format_flags(const char *f,
1697 int *p_width, int *p_precision,
1698 int *p_longflag, int *p_longlongflag, int *p_size_tflag)
1699{
1700 int width, precision, longflag, longlongflag, size_tflag;
1701
1702 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
1703 f++;
1704 width = 0;
1705 while (Py_ISDIGIT((unsigned)*f))
1706 width = (width*10) + *f++ - '0';
1707 precision = 0;
1708 if (*f == '.') {
1709 f++;
1710 while (Py_ISDIGIT((unsigned)*f))
1711 precision = (precision*10) + *f++ - '0';
1712 if (*f == '%') {
1713 /* "%.3%s" => f points to "3" */
1714 f--;
1715 }
1716 }
1717 if (*f == '\0') {
1718 /* bogus format "%.1" => go backward, f points to "1" */
1719 f--;
1720 }
1721 if (p_width != NULL)
1722 *p_width = width;
1723 if (p_precision != NULL)
1724 *p_precision = precision;
1725
1726 /* Handle %ld, %lu, %lld and %llu. */
1727 longflag = 0;
1728 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00001729 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00001730
1731 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00001732 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00001733 longflag = 1;
1734 ++f;
1735 }
1736#ifdef HAVE_LONG_LONG
1737 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00001738 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00001739 longlongflag = 1;
1740 f += 2;
1741 }
1742#endif
1743 }
1744 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00001745 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00001746 size_tflag = 1;
1747 ++f;
1748 }
1749 if (p_longflag != NULL)
1750 *p_longflag = longflag;
1751 if (p_longlongflag != NULL)
1752 *p_longlongflag = longlongflag;
1753 if (p_size_tflag != NULL)
1754 *p_size_tflag = size_tflag;
1755 return f;
1756}
1757
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001758/* maximum number of characters required for output of %ld. 21 characters
1759 allows for 64-bit integers (in decimal) and an optional sign. */
1760#define MAX_LONG_CHARS 21
1761/* maximum number of characters required for output of %lld.
1762 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
1763 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
1764#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
1765
Walter Dörwaldd2034312007-05-18 16:29:38 +00001766PyObject *
1767PyUnicode_FromFormatV(const char *format, va_list vargs)
1768{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001769 va_list count;
1770 Py_ssize_t callcount = 0;
1771 PyObject **callresults = NULL;
1772 PyObject **callresult = NULL;
1773 Py_ssize_t n = 0;
1774 int width = 0;
1775 int precision = 0;
1776 int zeropad;
1777 const char* f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001778 PyUnicodeObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001779 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001780 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001781 Py_UCS4 maxchar = 127; /* result is ASCII by default */
1782 Py_UCS4 argmaxchar;
1783 Py_ssize_t numbersize = 0;
1784 char *numberresults = NULL;
1785 char *numberresult = NULL;
1786 Py_ssize_t i;
1787 int kind;
1788 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001789
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001790 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001791 /* step 1: count the number of %S/%R/%A/%s format specifications
1792 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
1793 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001794 * result in an array)
1795 * also esimate a upper bound for all the number formats in the string,
1796 * numbers will be formated in step 3 and be keept in a '\0'-separated
1797 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00001798 for (f = format; *f; f++) {
1799 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001800 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001801 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
1802 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
1803 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
1804 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001805
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001806 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001807#ifdef HAVE_LONG_LONG
1808 if (longlongflag) {
1809 if (width < MAX_LONG_LONG_CHARS)
1810 width = MAX_LONG_LONG_CHARS;
1811 }
1812 else
1813#endif
1814 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
1815 including sign. Decimal takes the most space. This
1816 isn't enough for octal. If a width is specified we
1817 need more (which we allocate later). */
1818 if (width < MAX_LONG_CHARS)
1819 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001820
1821 /* account for the size + '\0' to separate numbers
1822 inside of the numberresults buffer */
1823 numbersize += (width + 1);
1824 }
1825 }
1826 else if ((unsigned char)*f > 127) {
1827 PyErr_Format(PyExc_ValueError,
1828 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
1829 "string, got a non-ASCII byte: 0x%02x",
1830 (unsigned char)*f);
1831 return NULL;
1832 }
1833 }
1834 /* step 2: allocate memory for the results of
1835 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
1836 if (callcount) {
1837 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
1838 if (!callresults) {
1839 PyErr_NoMemory();
1840 return NULL;
1841 }
1842 callresult = callresults;
1843 }
1844 /* step 2.5: allocate memory for the results of formating numbers */
1845 if (numbersize) {
1846 numberresults = PyObject_Malloc(numbersize);
1847 if (!numberresults) {
1848 PyErr_NoMemory();
1849 goto fail;
1850 }
1851 numberresult = numberresults;
1852 }
1853
1854 /* step 3: format numbers and figure out how large a buffer we need */
1855 for (f = format; *f; f++) {
1856 if (*f == '%') {
1857 const char* p;
1858 int longflag;
1859 int longlongflag;
1860 int size_tflag;
1861 int numprinted;
1862
1863 p = f;
1864 zeropad = (f[1] == '0');
1865 f = parse_format_flags(f, &width, &precision,
1866 &longflag, &longlongflag, &size_tflag);
1867 switch (*f) {
1868 case 'c':
1869 {
1870 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001871 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001872 n++;
1873 break;
1874 }
1875 case '%':
1876 n++;
1877 break;
1878 case 'i':
1879 case 'd':
1880 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1881 width, precision, *f);
1882 if (longflag)
1883 numprinted = sprintf(numberresult, fmt,
1884 va_arg(count, long));
1885#ifdef HAVE_LONG_LONG
1886 else if (longlongflag)
1887 numprinted = sprintf(numberresult, fmt,
1888 va_arg(count, PY_LONG_LONG));
1889#endif
1890 else if (size_tflag)
1891 numprinted = sprintf(numberresult, fmt,
1892 va_arg(count, Py_ssize_t));
1893 else
1894 numprinted = sprintf(numberresult, fmt,
1895 va_arg(count, int));
1896 n += numprinted;
1897 /* advance by +1 to skip over the '\0' */
1898 numberresult += (numprinted + 1);
1899 assert(*(numberresult - 1) == '\0');
1900 assert(*(numberresult - 2) != '\0');
1901 assert(numprinted >= 0);
1902 assert(numberresult <= numberresults + numbersize);
1903 break;
1904 case 'u':
1905 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1906 width, precision, 'u');
1907 if (longflag)
1908 numprinted = sprintf(numberresult, fmt,
1909 va_arg(count, unsigned long));
1910#ifdef HAVE_LONG_LONG
1911 else if (longlongflag)
1912 numprinted = sprintf(numberresult, fmt,
1913 va_arg(count, unsigned PY_LONG_LONG));
1914#endif
1915 else if (size_tflag)
1916 numprinted = sprintf(numberresult, fmt,
1917 va_arg(count, size_t));
1918 else
1919 numprinted = sprintf(numberresult, fmt,
1920 va_arg(count, unsigned int));
1921 n += numprinted;
1922 numberresult += (numprinted + 1);
1923 assert(*(numberresult - 1) == '\0');
1924 assert(*(numberresult - 2) != '\0');
1925 assert(numprinted >= 0);
1926 assert(numberresult <= numberresults + numbersize);
1927 break;
1928 case 'x':
1929 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
1930 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
1931 n += numprinted;
1932 numberresult += (numprinted + 1);
1933 assert(*(numberresult - 1) == '\0');
1934 assert(*(numberresult - 2) != '\0');
1935 assert(numprinted >= 0);
1936 assert(numberresult <= numberresults + numbersize);
1937 break;
1938 case 'p':
1939 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
1940 /* %p is ill-defined: ensure leading 0x. */
1941 if (numberresult[1] == 'X')
1942 numberresult[1] = 'x';
1943 else if (numberresult[1] != 'x') {
1944 memmove(numberresult + 2, numberresult,
1945 strlen(numberresult) + 1);
1946 numberresult[0] = '0';
1947 numberresult[1] = 'x';
1948 numprinted += 2;
1949 }
1950 n += numprinted;
1951 numberresult += (numprinted + 1);
1952 assert(*(numberresult - 1) == '\0');
1953 assert(*(numberresult - 2) != '\0');
1954 assert(numprinted >= 0);
1955 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001956 break;
1957 case 's':
1958 {
1959 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00001960 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001961 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
1962 if (!str)
1963 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001964 /* since PyUnicode_DecodeUTF8 returns already flexible
1965 unicode objects, there is no need to call ready on them */
1966 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001967 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001968 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001969 /* Remember the str and switch to the next slot */
1970 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001971 break;
1972 }
1973 case 'U':
1974 {
1975 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02001976 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001977 if (PyUnicode_READY(obj) == -1)
1978 goto fail;
1979 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001980 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001981 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001982 break;
1983 }
1984 case 'V':
1985 {
1986 PyObject *obj = va_arg(count, PyObject *);
1987 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001988 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001989 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02001990 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00001991 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001992 if (PyUnicode_READY(obj) == -1)
1993 goto fail;
1994 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001995 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001996 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001997 *callresult++ = NULL;
1998 }
1999 else {
2000 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2001 if (!str_obj)
2002 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002003 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002004 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002005 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002006 *callresult++ = str_obj;
2007 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002008 break;
2009 }
2010 case 'S':
2011 {
2012 PyObject *obj = va_arg(count, PyObject *);
2013 PyObject *str;
2014 assert(obj);
2015 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002016 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002017 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002018 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002019 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002020 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002021 /* Remember the str and switch to the next slot */
2022 *callresult++ = str;
2023 break;
2024 }
2025 case 'R':
2026 {
2027 PyObject *obj = va_arg(count, PyObject *);
2028 PyObject *repr;
2029 assert(obj);
2030 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002031 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002032 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002033 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002034 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002035 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002036 /* Remember the repr and switch to the next slot */
2037 *callresult++ = repr;
2038 break;
2039 }
2040 case 'A':
2041 {
2042 PyObject *obj = va_arg(count, PyObject *);
2043 PyObject *ascii;
2044 assert(obj);
2045 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002046 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002047 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002048 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002049 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002050 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002051 /* Remember the repr and switch to the next slot */
2052 *callresult++ = ascii;
2053 break;
2054 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002055 default:
2056 /* if we stumble upon an unknown
2057 formatting code, copy the rest of
2058 the format string to the output
2059 string. (we cannot just skip the
2060 code, since there's no way to know
2061 what's in the argument list) */
2062 n += strlen(p);
2063 goto expand;
2064 }
2065 } else
2066 n++;
2067 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002068 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002069 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002070 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002071 we don't have to resize the string.
2072 There can be no errors beyond this point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002073 string = (PyUnicodeObject *)PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002074 if (!string)
2075 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002076 kind = PyUnicode_KIND(string);
2077 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002078 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002079 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002080
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002081 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002082 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002083 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002084
2085 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002086 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2087 /* checking for == because the last argument could be a empty
2088 string, which causes i to point to end, the assert at the end of
2089 the loop */
2090 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002091
Benjamin Peterson14339b62009-01-31 16:36:08 +00002092 switch (*f) {
2093 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002094 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002095 const int ordinal = va_arg(vargs, int);
2096 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002097 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002098 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002099 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002100 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002101 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002102 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002103 case 'p':
2104 /* unused, since we already have the result */
2105 if (*f == 'p')
2106 (void) va_arg(vargs, void *);
2107 else
2108 (void) va_arg(vargs, int);
2109 /* extract the result from numberresults and append. */
2110 for (; *numberresult; ++i, ++numberresult)
2111 PyUnicode_WRITE(kind, data, i, *numberresult);
2112 /* skip over the separating '\0' */
2113 assert(*numberresult == '\0');
2114 numberresult++;
2115 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002116 break;
2117 case 's':
2118 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002119 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002120 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002121 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002122 size = PyUnicode_GET_LENGTH(*callresult);
2123 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002124 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2125 *callresult, 0,
2126 size) < 0)
2127 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002128 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002129 /* We're done with the unicode()/repr() => forget it */
2130 Py_DECREF(*callresult);
2131 /* switch to next unicode()/repr() result */
2132 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002133 break;
2134 }
2135 case 'U':
2136 {
2137 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002138 Py_ssize_t size;
2139 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2140 size = PyUnicode_GET_LENGTH(obj);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002141 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2142 obj, 0,
2143 size) < 0)
2144 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002145 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002146 break;
2147 }
2148 case 'V':
2149 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002150 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002151 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002152 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002153 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002154 size = PyUnicode_GET_LENGTH(obj);
2155 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002156 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2157 obj, 0,
2158 size) < 0)
2159 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002160 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002161 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002162 size = PyUnicode_GET_LENGTH(*callresult);
2163 assert(PyUnicode_KIND(*callresult) <=
2164 PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002165 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2166 *callresult,
2167 0, size) < 0)
2168 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002169 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002170 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002171 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002172 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002173 break;
2174 }
2175 case 'S':
2176 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002177 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002178 {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002179 /* unused, since we already have the result */
2180 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002181 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002182 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2183 *callresult, 0,
2184 PyUnicode_GET_LENGTH(*callresult)) < 0)
2185 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002186 i += PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002187 /* We're done with the unicode()/repr() => forget it */
2188 Py_DECREF(*callresult);
2189 /* switch to next unicode()/repr() result */
2190 ++callresult;
2191 break;
2192 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002193 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002194 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002195 break;
2196 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002197 for (; *p; ++p, ++i)
2198 PyUnicode_WRITE(kind, data, i, *p);
2199 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002200 goto end;
2201 }
Victor Stinner1205f272010-09-11 00:54:47 +00002202 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002203 else {
2204 assert(i < PyUnicode_GET_LENGTH(string));
2205 PyUnicode_WRITE(kind, data, i++, *f);
2206 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002207 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002208 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002209
Benjamin Peterson29060642009-01-31 22:14:21 +00002210 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002211 if (callresults)
2212 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002213 if (numberresults)
2214 PyObject_Free(numberresults);
2215 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00002216 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002217 if (callresults) {
2218 PyObject **callresult2 = callresults;
2219 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002220 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002221 ++callresult2;
2222 }
2223 PyObject_Free(callresults);
2224 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002225 if (numberresults)
2226 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002227 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002228}
2229
Walter Dörwaldd2034312007-05-18 16:29:38 +00002230PyObject *
2231PyUnicode_FromFormat(const char *format, ...)
2232{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002233 PyObject* ret;
2234 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002235
2236#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002237 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002238#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002239 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002240#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002241 ret = PyUnicode_FromFormatV(format, vargs);
2242 va_end(vargs);
2243 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002244}
2245
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002246#ifdef HAVE_WCHAR_H
2247
Victor Stinner5593d8a2010-10-02 11:11:27 +00002248/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2249 convert a Unicode object to a wide character string.
2250
Victor Stinnerd88d9832011-09-06 02:00:05 +02002251 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002252 character) required to convert the unicode object. Ignore size argument.
2253
Victor Stinnerd88d9832011-09-06 02:00:05 +02002254 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002255 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002256 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002257static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00002258unicode_aswidechar(PyUnicodeObject *unicode,
2259 wchar_t *w,
2260 Py_ssize_t size)
2261{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002262 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002263 const wchar_t *wstr;
2264
2265 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
2266 if (wstr == NULL)
2267 return -1;
2268
Victor Stinner5593d8a2010-10-02 11:11:27 +00002269 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002270 if (size > res)
2271 size = res + 1;
2272 else
2273 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002274 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002275 return res;
2276 }
2277 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002278 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002279}
2280
2281Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002282PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002283 wchar_t *w,
2284 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002285{
2286 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002287 PyErr_BadInternalCall();
2288 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002289 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002290 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002291}
2292
Victor Stinner137c34c2010-09-29 10:25:54 +00002293wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002294PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002295 Py_ssize_t *size)
2296{
2297 wchar_t* buffer;
2298 Py_ssize_t buflen;
2299
2300 if (unicode == NULL) {
2301 PyErr_BadInternalCall();
2302 return NULL;
2303 }
2304
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002305 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002306 if (buflen == -1)
2307 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002308 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002309 PyErr_NoMemory();
2310 return NULL;
2311 }
2312
Victor Stinner137c34c2010-09-29 10:25:54 +00002313 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2314 if (buffer == NULL) {
2315 PyErr_NoMemory();
2316 return NULL;
2317 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002318 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002319 if (buflen == -1)
2320 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002321 if (size != NULL)
2322 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002323 return buffer;
2324}
2325
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002326#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002327
Alexander Belopolsky40018472011-02-26 01:02:56 +00002328PyObject *
2329PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002330{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002331 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002332 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002333 PyErr_SetString(PyExc_ValueError,
2334 "chr() arg not in range(0x110000)");
2335 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002336 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002337
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002338 if (ordinal < 256)
2339 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002340
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002341 v = PyUnicode_New(1, ordinal);
2342 if (v == NULL)
2343 return NULL;
2344 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
2345 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002346}
2347
Alexander Belopolsky40018472011-02-26 01:02:56 +00002348PyObject *
2349PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002350{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002351 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002352 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002353 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002354 if (PyUnicode_READY(obj))
2355 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002356 Py_INCREF(obj);
2357 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002358 }
2359 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002360 /* For a Unicode subtype that's not a Unicode object,
2361 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002362 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002363 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002364 PyErr_Format(PyExc_TypeError,
2365 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002366 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002367 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002368}
2369
Alexander Belopolsky40018472011-02-26 01:02:56 +00002370PyObject *
2371PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002372 const char *encoding,
2373 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002374{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002375 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002376 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002377
Guido van Rossumd57fd912000-03-10 22:53:23 +00002378 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002379 PyErr_BadInternalCall();
2380 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002381 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002382
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002383 /* Decoding bytes objects is the most common case and should be fast */
2384 if (PyBytes_Check(obj)) {
2385 if (PyBytes_GET_SIZE(obj) == 0) {
2386 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002387 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002388 }
2389 else {
2390 v = PyUnicode_Decode(
2391 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2392 encoding, errors);
2393 }
2394 return v;
2395 }
2396
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002397 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002398 PyErr_SetString(PyExc_TypeError,
2399 "decoding str is not supported");
2400 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002401 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002402
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002403 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2404 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2405 PyErr_Format(PyExc_TypeError,
2406 "coercing to str: need bytes, bytearray "
2407 "or buffer-like object, %.80s found",
2408 Py_TYPE(obj)->tp_name);
2409 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002410 }
Tim Petersced69f82003-09-16 20:30:58 +00002411
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002412 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002413 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002414 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002415 }
Tim Petersced69f82003-09-16 20:30:58 +00002416 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002417 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002418
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002419 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002420 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002421}
2422
/* Copy encoding into lower, mapping upper case letters to lower case and
   '_' to '-' (so that e.g. UTF_8 is recognized as utf-8).

   lower_len is the capacity of lower, terminating null byte included.
   Return 1 on success, 0 if encoding is longer than lower_len-1 characters
   (in which case lower is left without a terminating null byte). */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src = encoding;
    char *dst = lower;
    /* Last usable slot: reserved for the terminating null byte. */
    char *const last = &lower[lower_len - 1];

    for (; *src != '\0'; src++, dst++) {
        if (dst == last)
            return 0;
        if (Py_ISUPPER(*src))
            *dst = Py_TOLOWER(*src);
        else if (*src == '_')
            *dst = '-';
        else
            *dst = *src;
    }
    *dst = '\0';
    return 1;
}
2455
Alexander Belopolsky40018472011-02-26 01:02:56 +00002456PyObject *
2457PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002458 Py_ssize_t size,
2459 const char *encoding,
2460 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002461{
2462 PyObject *buffer = NULL, *unicode;
2463 Py_buffer info;
2464 char lower[11]; /* Enough for any encoding shortcut */
2465
2466 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002467 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00002468
2469 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002470 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002471 if ((strcmp(lower, "utf-8") == 0) ||
2472 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002473 return PyUnicode_DecodeUTF8(s, size, errors);
2474 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002475 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002476 (strcmp(lower, "iso-8859-1") == 0))
2477 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002478#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002479 else if (strcmp(lower, "mbcs") == 0)
2480 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002481#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002482 else if (strcmp(lower, "ascii") == 0)
2483 return PyUnicode_DecodeASCII(s, size, errors);
2484 else if (strcmp(lower, "utf-16") == 0)
2485 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2486 else if (strcmp(lower, "utf-32") == 0)
2487 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2488 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002489
2490 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002491 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002492 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002493 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002494 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002495 if (buffer == NULL)
2496 goto onError;
2497 unicode = PyCodec_Decode(buffer, encoding, errors);
2498 if (unicode == NULL)
2499 goto onError;
2500 if (!PyUnicode_Check(unicode)) {
2501 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002502 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002503 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002504 Py_DECREF(unicode);
2505 goto onError;
2506 }
2507 Py_DECREF(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002508 if (PyUnicode_READY(unicode)) {
2509 Py_DECREF(unicode);
2510 return NULL;
2511 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002512 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002513
Benjamin Peterson29060642009-01-31 22:14:21 +00002514 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002515 Py_XDECREF(buffer);
2516 return NULL;
2517}
2518
Alexander Belopolsky40018472011-02-26 01:02:56 +00002519PyObject *
2520PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002521 const char *encoding,
2522 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002523{
2524 PyObject *v;
2525
2526 if (!PyUnicode_Check(unicode)) {
2527 PyErr_BadArgument();
2528 goto onError;
2529 }
2530
2531 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002532 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002533
2534 /* Decode via the codec registry */
2535 v = PyCodec_Decode(unicode, encoding, errors);
2536 if (v == NULL)
2537 goto onError;
2538 return v;
2539
Benjamin Peterson29060642009-01-31 22:14:21 +00002540 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002541 return NULL;
2542}
2543
Alexander Belopolsky40018472011-02-26 01:02:56 +00002544PyObject *
2545PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002546 const char *encoding,
2547 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002548{
2549 PyObject *v;
2550
2551 if (!PyUnicode_Check(unicode)) {
2552 PyErr_BadArgument();
2553 goto onError;
2554 }
2555
2556 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002557 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002558
2559 /* Decode via the codec registry */
2560 v = PyCodec_Decode(unicode, encoding, errors);
2561 if (v == NULL)
2562 goto onError;
2563 if (!PyUnicode_Check(v)) {
2564 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002565 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002566 Py_TYPE(v)->tp_name);
2567 Py_DECREF(v);
2568 goto onError;
2569 }
2570 return v;
2571
Benjamin Peterson29060642009-01-31 22:14:21 +00002572 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002573 return NULL;
2574}
2575
Alexander Belopolsky40018472011-02-26 01:02:56 +00002576PyObject *
2577PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002578 Py_ssize_t size,
2579 const char *encoding,
2580 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002581{
2582 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002583
Guido van Rossumd57fd912000-03-10 22:53:23 +00002584 unicode = PyUnicode_FromUnicode(s, size);
2585 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002586 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002587 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2588 Py_DECREF(unicode);
2589 return v;
2590}
2591
Alexander Belopolsky40018472011-02-26 01:02:56 +00002592PyObject *
2593PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002594 const char *encoding,
2595 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002596{
2597 PyObject *v;
2598
2599 if (!PyUnicode_Check(unicode)) {
2600 PyErr_BadArgument();
2601 goto onError;
2602 }
2603
2604 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002605 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002606
2607 /* Encode via the codec registry */
2608 v = PyCodec_Encode(unicode, encoding, errors);
2609 if (v == NULL)
2610 goto onError;
2611 return v;
2612
Benjamin Peterson29060642009-01-31 22:14:21 +00002613 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002614 return NULL;
2615}
2616
Victor Stinnerad158722010-10-27 00:25:46 +00002617PyObject *
2618PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00002619{
Victor Stinner99b95382011-07-04 14:23:54 +02002620#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002621 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2622 PyUnicode_GET_SIZE(unicode),
2623 NULL);
2624#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002625 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00002626#else
Victor Stinner793b5312011-04-27 00:24:21 +02002627 PyInterpreterState *interp = PyThreadState_GET()->interp;
2628 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2629 cannot use it to encode and decode filenames before it is loaded. Load
2630 the Python codec requires to encode at least its own filename. Use the C
2631 version of the locale codec until the codec registry is initialized and
2632 the Python codec is loaded.
2633
2634 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2635 cannot only rely on it: check also interp->fscodec_initialized for
2636 subinterpreters. */
2637 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00002638 return PyUnicode_AsEncodedString(unicode,
2639 Py_FileSystemDefaultEncoding,
2640 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00002641 }
2642 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002643 /* locale encoding with surrogateescape */
2644 wchar_t *wchar;
2645 char *bytes;
2646 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00002647 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002648
2649 wchar = PyUnicode_AsWideCharString(unicode, NULL);
2650 if (wchar == NULL)
2651 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002652 bytes = _Py_wchar2char(wchar, &error_pos);
2653 if (bytes == NULL) {
2654 if (error_pos != (size_t)-1) {
2655 char *errmsg = strerror(errno);
2656 PyObject *exc = NULL;
2657 if (errmsg == NULL)
2658 errmsg = "Py_wchar2char() failed";
2659 raise_encode_exception(&exc,
2660 "filesystemencoding",
2661 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
2662 error_pos, error_pos+1,
2663 errmsg);
2664 Py_XDECREF(exc);
2665 }
2666 else
2667 PyErr_NoMemory();
2668 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002669 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002670 }
2671 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002672
2673 bytes_obj = PyBytes_FromString(bytes);
2674 PyMem_Free(bytes);
2675 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00002676 }
Victor Stinnerad158722010-10-27 00:25:46 +00002677#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00002678}
2679
Alexander Belopolsky40018472011-02-26 01:02:56 +00002680PyObject *
2681PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002682 const char *encoding,
2683 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002684{
2685 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00002686 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00002687
Guido van Rossumd57fd912000-03-10 22:53:23 +00002688 if (!PyUnicode_Check(unicode)) {
2689 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002690 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002691 }
Fred Drakee4315f52000-05-09 19:53:39 +00002692
Victor Stinner2f283c22011-03-02 01:21:46 +00002693 if (encoding == NULL) {
2694 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002695 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002696 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002697 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner2f283c22011-03-02 01:21:46 +00002698 }
Fred Drakee4315f52000-05-09 19:53:39 +00002699
2700 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002701 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002702 if ((strcmp(lower, "utf-8") == 0) ||
2703 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00002704 {
Victor Stinner2f283c22011-03-02 01:21:46 +00002705 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002706 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002707 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002708 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00002709 }
Victor Stinner37296e82010-06-10 13:36:23 +00002710 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002711 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002712 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002713 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002714#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002715 else if (strcmp(lower, "mbcs") == 0)
2716 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2717 PyUnicode_GET_SIZE(unicode),
2718 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002719#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002720 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002721 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00002722 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002723
2724 /* Encode via the codec registry */
2725 v = PyCodec_Encode(unicode, encoding, errors);
2726 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002727 return NULL;
2728
2729 /* The normal path */
2730 if (PyBytes_Check(v))
2731 return v;
2732
2733 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002734 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002735 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002736 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002737
2738 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
2739 "encoder %s returned bytearray instead of bytes",
2740 encoding);
2741 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002742 Py_DECREF(v);
2743 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002744 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002745
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002746 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
2747 Py_DECREF(v);
2748 return b;
2749 }
2750
2751 PyErr_Format(PyExc_TypeError,
2752 "encoder did not return a bytes object (type=%.400s)",
2753 Py_TYPE(v)->tp_name);
2754 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002755 return NULL;
2756}
2757
Alexander Belopolsky40018472011-02-26 01:02:56 +00002758PyObject *
2759PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002760 const char *encoding,
2761 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002762{
2763 PyObject *v;
2764
2765 if (!PyUnicode_Check(unicode)) {
2766 PyErr_BadArgument();
2767 goto onError;
2768 }
2769
2770 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002771 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002772
2773 /* Encode via the codec registry */
2774 v = PyCodec_Encode(unicode, encoding, errors);
2775 if (v == NULL)
2776 goto onError;
2777 if (!PyUnicode_Check(v)) {
2778 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002779 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002780 Py_TYPE(v)->tp_name);
2781 Py_DECREF(v);
2782 goto onError;
2783 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002784 return v;
Tim Petersced69f82003-09-16 20:30:58 +00002785
Benjamin Peterson29060642009-01-31 22:14:21 +00002786 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002787 return NULL;
2788}
2789
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002790PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00002791PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002792 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00002793 return PyUnicode_DecodeFSDefaultAndSize(s, size);
2794}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002795
Christian Heimes5894ba72007-11-04 11:43:14 +00002796PyObject*
2797PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
2798{
Victor Stinner99b95382011-07-04 14:23:54 +02002799#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002800 return PyUnicode_DecodeMBCS(s, size, NULL);
2801#elif defined(__APPLE__)
2802 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
2803#else
Victor Stinner793b5312011-04-27 00:24:21 +02002804 PyInterpreterState *interp = PyThreadState_GET()->interp;
2805 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2806 cannot use it to encode and decode filenames before it is loaded. Load
2807 the Python codec requires to encode at least its own filename. Use the C
2808 version of the locale codec until the codec registry is initialized and
2809 the Python codec is loaded.
2810
2811 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2812 cannot only rely on it: check also interp->fscodec_initialized for
2813 subinterpreters. */
2814 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002815 return PyUnicode_Decode(s, size,
2816 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00002817 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002818 }
2819 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002820 /* locale encoding with surrogateescape */
2821 wchar_t *wchar;
2822 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00002823 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002824
2825 if (s[size] != '\0' || size != strlen(s)) {
2826 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2827 return NULL;
2828 }
2829
Victor Stinner168e1172010-10-16 23:16:16 +00002830 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002831 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00002832 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002833
Victor Stinner168e1172010-10-16 23:16:16 +00002834 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002835 PyMem_Free(wchar);
2836 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002837 }
Victor Stinnerad158722010-10-27 00:25:46 +00002838#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002839}
2840
Martin v. Löwis011e8422009-05-05 04:43:17 +00002841
2842int
2843PyUnicode_FSConverter(PyObject* arg, void* addr)
2844{
2845 PyObject *output = NULL;
2846 Py_ssize_t size;
2847 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002848 if (arg == NULL) {
2849 Py_DECREF(*(PyObject**)addr);
2850 return 1;
2851 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00002852 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00002853 output = arg;
2854 Py_INCREF(output);
2855 }
2856 else {
2857 arg = PyUnicode_FromObject(arg);
2858 if (!arg)
2859 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00002860 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002861 Py_DECREF(arg);
2862 if (!output)
2863 return 0;
2864 if (!PyBytes_Check(output)) {
2865 Py_DECREF(output);
2866 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
2867 return 0;
2868 }
2869 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00002870 size = PyBytes_GET_SIZE(output);
2871 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002872 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05002873 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00002874 Py_DECREF(output);
2875 return 0;
2876 }
2877 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002878 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00002879}
2880
2881
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002882int
2883PyUnicode_FSDecoder(PyObject* arg, void* addr)
2884{
2885 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002886 if (arg == NULL) {
2887 Py_DECREF(*(PyObject**)addr);
2888 return 1;
2889 }
2890 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002891 if (PyUnicode_READY(arg))
2892 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002893 output = arg;
2894 Py_INCREF(output);
2895 }
2896 else {
2897 arg = PyBytes_FromObject(arg);
2898 if (!arg)
2899 return 0;
2900 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
2901 PyBytes_GET_SIZE(arg));
2902 Py_DECREF(arg);
2903 if (!output)
2904 return 0;
2905 if (!PyUnicode_Check(output)) {
2906 Py_DECREF(output);
2907 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
2908 return 0;
2909 }
2910 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002911 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
2912 PyUnicode_GET_LENGTH(output), 0, 1)) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002913 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2914 Py_DECREF(output);
2915 return 0;
2916 }
2917 *(PyObject**)addr = output;
2918 return Py_CLEANUP_SUPPORTED;
2919}
2920
2921
Martin v. Löwis5b222132007-06-10 09:51:05 +00002922char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002923PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002924{
Christian Heimesf3863112007-11-22 07:46:41 +00002925 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002926 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
2927
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00002928 if (!PyUnicode_Check(unicode)) {
2929 PyErr_BadArgument();
2930 return NULL;
2931 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002932 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002933 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002934
Victor Stinnere90fe6a2011-10-01 16:48:13 +02002935 if (PyUnicode_UTF8(unicode) == NULL) {
2936 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002937 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
2938 if (bytes == NULL)
2939 return NULL;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02002940 _PyUnicode_UTF8(u) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
2941 if (_PyUnicode_UTF8(u) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002942 Py_DECREF(bytes);
2943 return NULL;
2944 }
Victor Stinnere90fe6a2011-10-01 16:48:13 +02002945 _PyUnicode_UTF8_LENGTH(u) = PyBytes_GET_SIZE(bytes);
2946 Py_MEMCPY(_PyUnicode_UTF8(u), PyBytes_AS_STRING(bytes), _PyUnicode_UTF8_LENGTH(u) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002947 Py_DECREF(bytes);
2948 }
2949
2950 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02002951 *psize = PyUnicode_UTF8_LENGTH(unicode);
2952 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00002953}
2954
2955char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002956PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00002957{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002958 return PyUnicode_AsUTF8AndSize(unicode, NULL);
2959}
2960
2961#ifdef Py_DEBUG
2962int unicode_as_unicode_calls = 0;
2963#endif
2964
2965
2966Py_UNICODE *
2967PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
2968{
2969 PyUnicodeObject *u;
2970 const unsigned char *one_byte;
2971#if SIZEOF_WCHAR_T == 4
2972 const Py_UCS2 *two_bytes;
2973#else
2974 const Py_UCS4 *four_bytes;
2975 const Py_UCS4 *ucs4_end;
2976 Py_ssize_t num_surrogates;
2977#endif
2978 wchar_t *w;
2979 wchar_t *wchar_end;
2980
2981 if (!PyUnicode_Check(unicode)) {
2982 PyErr_BadArgument();
2983 return NULL;
2984 }
2985 u = (PyUnicodeObject*)unicode;
2986 if (_PyUnicode_WSTR(u) == NULL) {
2987 /* Non-ASCII compact unicode object */
2988 assert(_PyUnicode_KIND(u) != 0);
2989 assert(PyUnicode_IS_READY(u));
2990
2991#ifdef Py_DEBUG
2992 ++unicode_as_unicode_calls;
2993#endif
2994
2995 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
2996#if SIZEOF_WCHAR_T == 2
2997 four_bytes = PyUnicode_4BYTE_DATA(u);
2998 ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
2999 num_surrogates = 0;
3000
3001 for (; four_bytes < ucs4_end; ++four_bytes) {
3002 if (*four_bytes > 0xFFFF)
3003 ++num_surrogates;
3004 }
3005
3006 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
3007 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
3008 if (!_PyUnicode_WSTR(u)) {
3009 PyErr_NoMemory();
3010 return NULL;
3011 }
3012 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;
3013
3014 w = _PyUnicode_WSTR(u);
3015 wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
3016 four_bytes = PyUnicode_4BYTE_DATA(u);
3017 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3018 if (*four_bytes > 0xFFFF) {
3019 /* encode surrogate pair in this case */
3020 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
3021 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
3022 }
3023 else
3024 *w = *four_bytes;
3025
3026 if (w > wchar_end) {
3027 assert(0 && "Miscalculated string end");
3028 }
3029 }
3030 *w = 0;
3031#else
3032 /* sizeof(wchar_t) == 4 */
3033 Py_FatalError("Impossible unicode object state, wstr and str "
3034 "should share memory already.");
3035 return NULL;
3036#endif
3037 }
3038 else {
3039 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3040 (_PyUnicode_LENGTH(u) + 1));
3041 if (!_PyUnicode_WSTR(u)) {
3042 PyErr_NoMemory();
3043 return NULL;
3044 }
3045 if (!PyUnicode_IS_COMPACT_ASCII(u))
3046 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
3047 w = _PyUnicode_WSTR(u);
3048 wchar_end = w + _PyUnicode_LENGTH(u);
3049
3050 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
3051 one_byte = PyUnicode_1BYTE_DATA(u);
3052 for (; w < wchar_end; ++one_byte, ++w)
3053 *w = *one_byte;
3054 /* null-terminate the wstr */
3055 *w = 0;
3056 }
3057 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
3058#if SIZEOF_WCHAR_T == 4
3059 two_bytes = PyUnicode_2BYTE_DATA(u);
3060 for (; w < wchar_end; ++two_bytes, ++w)
3061 *w = *two_bytes;
3062 /* null-terminate the wstr */
3063 *w = 0;
3064#else
3065 /* sizeof(wchar_t) == 2 */
3066 PyObject_FREE(_PyUnicode_WSTR(u));
3067 _PyUnicode_WSTR(u) = NULL;
3068 Py_FatalError("Impossible unicode object state, wstr "
3069 "and str should share memory already.");
3070 return NULL;
3071#endif
3072 }
3073 else {
3074 assert(0 && "This should never happen.");
3075 }
3076 }
3077 }
3078 if (size != NULL)
3079 *size = PyUnicode_WSTR_LENGTH(u);
3080 return _PyUnicode_WSTR(u);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003081}
3082
Alexander Belopolsky40018472011-02-26 01:02:56 +00003083Py_UNICODE *
3084PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003085{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003086 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003087}
3088
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003089
Alexander Belopolsky40018472011-02-26 01:02:56 +00003090Py_ssize_t
3091PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003092{
3093 if (!PyUnicode_Check(unicode)) {
3094 PyErr_BadArgument();
3095 goto onError;
3096 }
3097 return PyUnicode_GET_SIZE(unicode);
3098
Benjamin Peterson29060642009-01-31 22:14:21 +00003099 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003100 return -1;
3101}
3102
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003103Py_ssize_t
3104PyUnicode_GetLength(PyObject *unicode)
3105{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003106 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003107 PyErr_BadArgument();
3108 return -1;
3109 }
3110
3111 return PyUnicode_GET_LENGTH(unicode);
3112}
3113
3114Py_UCS4
3115PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3116{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003117 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3118 PyErr_BadArgument();
3119 return (Py_UCS4)-1;
3120 }
3121 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3122 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003123 return (Py_UCS4)-1;
3124 }
3125 return PyUnicode_READ_CHAR(unicode, index);
3126}
3127
3128int
3129PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3130{
3131 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003132 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003133 return -1;
3134 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02003135 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3136 PyErr_SetString(PyExc_IndexError, "string index out of range");
3137 return -1;
3138 }
3139 if (_PyUnicode_Dirty(unicode))
3140 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003141 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3142 index, ch);
3143 return 0;
3144}
3145
/* Return the name of the default encoding, which is always "utf-8".  The
   returned string has static storage and must not be modified or freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3151
Victor Stinner554f3f02010-06-16 23:33:54 +00003152/* create or adjust a UnicodeDecodeError */
3153static void
3154make_decode_exception(PyObject **exceptionObject,
3155 const char *encoding,
3156 const char *input, Py_ssize_t length,
3157 Py_ssize_t startpos, Py_ssize_t endpos,
3158 const char *reason)
3159{
3160 if (*exceptionObject == NULL) {
3161 *exceptionObject = PyUnicodeDecodeError_Create(
3162 encoding, input, length, startpos, endpos, reason);
3163 }
3164 else {
3165 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3166 goto onError;
3167 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3168 goto onError;
3169 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3170 goto onError;
3171 }
3172 return;
3173
3174onError:
3175 Py_DECREF(*exceptionObject);
3176 *exceptionObject = NULL;
3177}
3178
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003179/* error handling callback helper:
3180 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003181 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003182 and adjust various state variables.
3183 return 0 on success, -1 on error
3184*/
3185
Alexander Belopolsky40018472011-02-26 01:02:56 +00003186static int
3187unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003188 const char *encoding, const char *reason,
3189 const char **input, const char **inend, Py_ssize_t *startinpos,
3190 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
3191 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003192{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003193 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003194
3195 PyObject *restuple = NULL;
3196 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003197 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003198 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003199 Py_ssize_t requiredsize;
3200 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003201 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003202 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003203 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003204 int res = -1;
3205
3206 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003207 *errorHandler = PyCodec_LookupError(errors);
3208 if (*errorHandler == NULL)
3209 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003210 }
3211
Victor Stinner554f3f02010-06-16 23:33:54 +00003212 make_decode_exception(exceptionObject,
3213 encoding,
3214 *input, *inend - *input,
3215 *startinpos, *endinpos,
3216 reason);
3217 if (*exceptionObject == NULL)
3218 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003219
3220 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3221 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003222 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003223 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003224 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003225 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003226 }
3227 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003228 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003229
3230 /* Copy back the bytes variables, which might have been modified by the
3231 callback */
3232 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3233 if (!inputobj)
3234 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003235 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003236 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003237 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003238 *input = PyBytes_AS_STRING(inputobj);
3239 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003240 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003241 /* we can DECREF safely, as the exception has another reference,
3242 so the object won't go away. */
3243 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003244
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003245 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003246 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003247 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003248 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3249 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003250 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003251
3252 /* need more space? (at least enough for what we
3253 have+the replacement+the rest of the string (starting
3254 at the new input position), so we won't have to check space
3255 when there are no errors in the rest of the string) */
3256 repptr = PyUnicode_AS_UNICODE(repunicode);
3257 repsize = PyUnicode_GET_SIZE(repunicode);
3258 requiredsize = *outpos + repsize + insize-newpos;
3259 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003260 if (requiredsize<2*outsize)
3261 requiredsize = 2*outsize;
Victor Stinnerfe226c02011-10-03 03:52:20 +02003262 if (PyUnicode_Resize((PyObject**)output, requiredsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003263 goto onError;
3264 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003265 }
3266 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003267 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003268 Py_UNICODE_COPY(*outptr, repptr, repsize);
3269 *outptr += repsize;
3270 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003271
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003272 /* we made it! */
3273 res = 0;
3274
Benjamin Peterson29060642009-01-31 22:14:21 +00003275 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003276 Py_XDECREF(restuple);
3277 return res;
3278}
3279
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * directO and directWS are parenthesized in the expansion so that an
 * argument which is itself an expression (e.g. `a || b`) is evaluated
 * as a unit instead of being regrouped by operator precedence. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003360
Alexander Belopolsky40018472011-02-26 01:02:56 +00003361PyObject *
3362PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003363 Py_ssize_t size,
3364 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003365{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003366 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3367}
3368
Antoine Pitrou244651a2009-05-04 18:56:13 +00003369/* The decoder. The only state we preserve is our read position,
3370 * i.e. how many characters we have consumed. So if we end in the
3371 * middle of a shift sequence we have to back off the read position
3372 * and the output to the beginning of the sequence, otherwise we lose
3373 * all the shift state (seen bits, number of bits seen, high
3374 * surrogate). */
3375
Alexander Belopolsky40018472011-02-26 01:02:56 +00003376PyObject *
3377PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003378 Py_ssize_t size,
3379 const char *errors,
3380 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003381{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003382 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003383 Py_ssize_t startinpos;
3384 Py_ssize_t endinpos;
3385 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003386 const char *e;
3387 PyUnicodeObject *unicode;
3388 Py_UNICODE *p;
3389 const char *errmsg = "";
3390 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003391 Py_UNICODE *shiftOutStart;
3392 unsigned int base64bits = 0;
3393 unsigned long base64buffer = 0;
3394 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003395 PyObject *errorHandler = NULL;
3396 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003397
3398 unicode = _PyUnicode_New(size);
3399 if (!unicode)
3400 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003401 if (size == 0) {
3402 if (consumed)
3403 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003404 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003405 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003406
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003407 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003408 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003409 e = s + size;
3410
3411 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003412 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003413 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003414 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003415
Antoine Pitrou244651a2009-05-04 18:56:13 +00003416 if (inShift) { /* in a base-64 section */
3417 if (IS_BASE64(ch)) { /* consume a base-64 character */
3418 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3419 base64bits += 6;
3420 s++;
3421 if (base64bits >= 16) {
3422 /* we have enough bits for a UTF-16 value */
3423 Py_UNICODE outCh = (Py_UNICODE)
3424 (base64buffer >> (base64bits-16));
3425 base64bits -= 16;
3426 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3427 if (surrogate) {
3428 /* expecting a second surrogate */
3429 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3430#ifdef Py_UNICODE_WIDE
3431 *p++ = (((surrogate & 0x3FF)<<10)
3432 | (outCh & 0x3FF)) + 0x10000;
3433#else
3434 *p++ = surrogate;
3435 *p++ = outCh;
3436#endif
3437 surrogate = 0;
3438 }
3439 else {
3440 surrogate = 0;
3441 errmsg = "second surrogate missing";
3442 goto utf7Error;
3443 }
3444 }
3445 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3446 /* first surrogate */
3447 surrogate = outCh;
3448 }
3449 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3450 errmsg = "unexpected second surrogate";
3451 goto utf7Error;
3452 }
3453 else {
3454 *p++ = outCh;
3455 }
3456 }
3457 }
3458 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003459 inShift = 0;
3460 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003461 if (surrogate) {
3462 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003463 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003464 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003465 if (base64bits > 0) { /* left-over bits */
3466 if (base64bits >= 6) {
3467 /* We've seen at least one base-64 character */
3468 errmsg = "partial character in shift sequence";
3469 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003470 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003471 else {
3472 /* Some bits remain; they should be zero */
3473 if (base64buffer != 0) {
3474 errmsg = "non-zero padding bits in shift sequence";
3475 goto utf7Error;
3476 }
3477 }
3478 }
3479 if (ch != '-') {
3480 /* '-' is absorbed; other terminating
3481 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003482 *p++ = ch;
3483 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003484 }
3485 }
3486 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003487 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003488 s++; /* consume '+' */
3489 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003490 s++;
3491 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003492 }
3493 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003494 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003495 shiftOutStart = p;
3496 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003497 }
3498 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003499 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003500 *p++ = ch;
3501 s++;
3502 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003503 else {
3504 startinpos = s-starts;
3505 s++;
3506 errmsg = "unexpected special character";
3507 goto utf7Error;
3508 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003509 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003510utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003511 outpos = p-PyUnicode_AS_UNICODE(unicode);
3512 endinpos = s-starts;
3513 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003514 errors, &errorHandler,
3515 "utf7", errmsg,
3516 &starts, &e, &startinpos, &endinpos, &exc, &s,
3517 &unicode, &outpos, &p))
3518 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003519 }
3520
Antoine Pitrou244651a2009-05-04 18:56:13 +00003521 /* end of string */
3522
3523 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3524 /* if we're in an inconsistent state, that's an error */
3525 if (surrogate ||
3526 (base64bits >= 6) ||
3527 (base64bits > 0 && base64buffer != 0)) {
3528 outpos = p-PyUnicode_AS_UNICODE(unicode);
3529 endinpos = size;
3530 if (unicode_decode_call_errorhandler(
3531 errors, &errorHandler,
3532 "utf7", "unterminated shift sequence",
3533 &starts, &e, &startinpos, &endinpos, &exc, &s,
3534 &unicode, &outpos, &p))
3535 goto onError;
3536 if (s < e)
3537 goto restart;
3538 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003539 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003540
3541 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003542 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003543 if (inShift) {
3544 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003545 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003546 }
3547 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003548 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003549 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003550 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003551
Victor Stinnerfe226c02011-10-03 03:52:20 +02003552 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003553 goto onError;
3554
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003555 Py_XDECREF(errorHandler);
3556 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003557 if (PyUnicode_READY(unicode) == -1) {
3558 Py_DECREF(unicode);
3559 return NULL;
3560 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003561 return (PyObject *)unicode;
3562
Benjamin Peterson29060642009-01-31 22:14:21 +00003563 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003564 Py_XDECREF(errorHandler);
3565 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003566 Py_DECREF(unicode);
3567 return NULL;
3568}
3569
3570
Alexander Belopolsky40018472011-02-26 01:02:56 +00003571PyObject *
3572PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003573 Py_ssize_t size,
3574 int base64SetO,
3575 int base64WhiteSpace,
3576 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003577{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003578 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003579 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003580 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003581 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003582 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003583 unsigned int base64bits = 0;
3584 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003585 char * out;
3586 char * start;
3587
3588 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003589 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003590
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003591 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003592 return PyErr_NoMemory();
3593
Antoine Pitrou244651a2009-05-04 18:56:13 +00003594 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003595 if (v == NULL)
3596 return NULL;
3597
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003598 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003599 for (;i < size; ++i) {
3600 Py_UNICODE ch = s[i];
3601
Antoine Pitrou244651a2009-05-04 18:56:13 +00003602 if (inShift) {
3603 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3604 /* shifting out */
3605 if (base64bits) { /* output remaining bits */
3606 *out++ = TO_BASE64(base64buffer << (6-base64bits));
3607 base64buffer = 0;
3608 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003609 }
3610 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003611 /* Characters not in the BASE64 set implicitly unshift the sequence
3612 so no '-' is required, except if the character is itself a '-' */
3613 if (IS_BASE64(ch) || ch == '-') {
3614 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003615 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003616 *out++ = (char) ch;
3617 }
3618 else {
3619 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00003620 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003621 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003622 else { /* not in a shift sequence */
3623 if (ch == '+') {
3624 *out++ = '+';
3625 *out++ = '-';
3626 }
3627 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3628 *out++ = (char) ch;
3629 }
3630 else {
3631 *out++ = '+';
3632 inShift = 1;
3633 goto encode_char;
3634 }
3635 }
3636 continue;
3637encode_char:
3638#ifdef Py_UNICODE_WIDE
3639 if (ch >= 0x10000) {
3640 /* code first surrogate */
3641 base64bits += 16;
3642 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
3643 while (base64bits >= 6) {
3644 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3645 base64bits -= 6;
3646 }
3647 /* prepare second surrogate */
3648 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
3649 }
3650#endif
3651 base64bits += 16;
3652 base64buffer = (base64buffer << 16) | ch;
3653 while (base64bits >= 6) {
3654 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3655 base64bits -= 6;
3656 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00003657 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003658 if (base64bits)
3659 *out++= TO_BASE64(base64buffer << (6-base64bits) );
3660 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003661 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003662 if (_PyBytes_Resize(&v, out - start) < 0)
3663 return NULL;
3664 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003665}
3666
Antoine Pitrou244651a2009-05-04 18:56:13 +00003667#undef IS_BASE64
3668#undef FROM_BASE64
3669#undef TO_BASE64
3670#undef DECODE_DIRECT
3671#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003672
/* --- UTF-8 Codec -------------------------------------------------------- */

/* Sequence length implied by each possible first byte of a UTF-8 sequence,
   indexed by the byte value.  A zero entry marks an illegal prefix.
   See RFC 3629 for details. */
static
char utf8_code_length[256] = {
    /* 00-7F: single-byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    /* 80-BF: illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    /* C0-FF: multi-byte prefixes; C0, C1 and F5-FF are illegal */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
3696
Alexander Belopolsky40018472011-02-26 01:02:56 +00003697PyObject *
3698PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003699 Py_ssize_t size,
3700 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003701{
Walter Dörwald69652032004-09-07 20:24:22 +00003702 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3703}
3704
Antoine Pitrouab868312009-01-10 15:40:25 +00003705/* Mask to check or force alignment of a pointer to C 'long' boundaries */
3706#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
3707
3708/* Mask to quickly check whether a C 'long' contains a
3709 non-ASCII, UTF8-encoded char. */
3710#if (SIZEOF_LONG == 8)
3711# define ASCII_CHAR_MASK 0x8080808080808080L
3712#elif (SIZEOF_LONG == 4)
3713# define ASCII_CHAR_MASK 0x80808080L
3714#else
3715# error C 'long' size should be either 4 or 8!
3716#endif
3717
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003718/* Scans a UTF-8 string and returns the maximum character to be expected,
3719 the size of the decoded unicode string and if any major errors were
3720 encountered.
3721
3722 This function does check basic UTF-8 sanity, it does however NOT CHECK
3723 if the string contains surrogates, and if all continuation bytes are
3724 within the correct ranges, these checks are performed in
3725 PyUnicode_DecodeUTF8Stateful.
3726
3727 If it sets has_errors to 1, it means the value of unicode_size and max_char
3728 will be bogus and you should not rely on useful information in them.
3729 */
3730static Py_UCS4
3731utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
3732 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
3733 int *has_errors)
3734{
3735 Py_ssize_t n;
3736 Py_ssize_t char_count = 0;
3737 Py_UCS4 max_char = 127, new_max;
3738 Py_UCS4 upper_bound;
3739 const unsigned char *p = (const unsigned char *)s;
3740 const unsigned char *end = p + string_size;
3741 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
3742 int err = 0;
3743
3744 for (; p < end && !err; ++p, ++char_count) {
3745 /* Only check value if it's not a ASCII char... */
3746 if (*p < 0x80) {
3747 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
3748 an explanation. */
3749 if (!((size_t) p & LONG_PTR_MASK)) {
3750 /* Help register allocation */
3751 register const unsigned char *_p = p;
3752 while (_p < aligned_end) {
3753 unsigned long value = *(unsigned long *) _p;
3754 if (value & ASCII_CHAR_MASK)
3755 break;
3756 _p += SIZEOF_LONG;
3757 char_count += SIZEOF_LONG;
3758 }
3759 p = _p;
3760 if (p == end)
3761 break;
3762 }
3763 }
3764 if (*p >= 0x80) {
3765 n = utf8_code_length[*p];
3766 new_max = max_char;
3767 switch (n) {
3768 /* invalid start byte */
3769 case 0:
3770 err = 1;
3771 break;
3772 case 2:
3773 /* Code points between 0x00FF and 0x07FF inclusive.
3774 Approximate the upper bound of the code point,
3775 if this flips over 255 we can be sure it will be more
3776 than 255 and the string will need 2 bytes per code coint,
3777 if it stays under or equal to 255, we can be sure 1 byte
3778 is enough.
3779 ((*p & 0b00011111) << 6) | 0b00111111 */
3780 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
3781 if (max_char < upper_bound)
3782 new_max = upper_bound;
3783 /* Ensure we track at least that we left ASCII space. */
3784 if (new_max < 128)
3785 new_max = 128;
3786 break;
3787 case 3:
3788 /* Between 0x0FFF and 0xFFFF inclusive, so values are
3789 always > 255 and <= 65535 and will always need 2 bytes. */
3790 if (max_char < 65535)
3791 new_max = 65535;
3792 break;
3793 case 4:
3794 /* Code point will be above 0xFFFF for sure in this case. */
3795 new_max = 65537;
3796 break;
3797 /* Internal error, this should be caught by the first if */
3798 case 1:
3799 default:
3800 assert(0 && "Impossible case in utf8_max_char_and_size");
3801 err = 1;
3802 }
3803 /* Instead of number of overall bytes for this code point,
3804 n containts the number of following bytes: */
3805 --n;
3806 /* Check if the follow up chars are all valid continuation bytes */
3807 if (n >= 1) {
3808 const unsigned char *cont;
3809 if ((p + n) >= end) {
3810 if (consumed == 0)
3811 /* incomplete data, non-incremental decoding */
3812 err = 1;
3813 break;
3814 }
3815 for (cont = p + 1; cont < (p + n); ++cont) {
3816 if ((*cont & 0xc0) != 0x80) {
3817 err = 1;
3818 break;
3819 }
3820 }
3821 p += n;
3822 }
3823 else
3824 err = 1;
3825 max_char = new_max;
3826 }
3827 }
3828
3829 if (unicode_size)
3830 *unicode_size = char_count;
3831 if (has_errors)
3832 *has_errors = err;
3833 return max_char;
3834}
3835
/* Store `value` at position `index` of the character buffer `buf`, whose
   element type is selected by `kind`.  Similar to PyUnicode_WRITE, but it
   can also write into the wstr field of the legacy unicode representation
   (PyUnicode_WCHAR_KIND).  Any kind that is not WCHAR, 1BYTE or 2BYTE is
   written as Py_UCS4.  `kind` is evaluated once up front; only one branch
   executes, so `buf`, `index` and `value` are evaluated once as well. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value)                     \
    do {                                                                    \
        const int wkind_ = (kind);                                          \
        if (wkind_ == PyUnicode_WCHAR_KIND) {                               \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);           \
        }                                                                   \
        else if (wkind_ == PyUnicode_1BYTE_KIND) {                          \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value);     \
        }                                                                   \
        else if (wkind_ == PyUnicode_2BYTE_KIND) {                          \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value);                 \
        }                                                                   \
        else {                                                              \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value);                 \
        }                                                                   \
    } while (0)
3850
Alexander Belopolsky40018472011-02-26 01:02:56 +00003851PyObject *
3852PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003853 Py_ssize_t size,
3854 const char *errors,
3855 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003856{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003857 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003858 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00003859 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003860 Py_ssize_t startinpos;
3861 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00003862 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003863 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003864 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003865 PyObject *errorHandler = NULL;
3866 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003867 Py_UCS4 maxchar = 0;
3868 Py_ssize_t unicode_size;
3869 Py_ssize_t i;
3870 int kind;
3871 void *data;
3872 int has_errors;
3873 Py_UNICODE *error_outptr;
3874#if SIZEOF_WCHAR_T == 2
3875 Py_ssize_t wchar_offset = 0;
3876#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003877
Walter Dörwald69652032004-09-07 20:24:22 +00003878 if (size == 0) {
3879 if (consumed)
3880 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003881 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00003882 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003883 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
3884 consumed, &has_errors);
3885 if (has_errors) {
3886 unicode = _PyUnicode_New(size);
3887 if (!unicode)
3888 return NULL;
3889 kind = PyUnicode_WCHAR_KIND;
3890 data = PyUnicode_AS_UNICODE(unicode);
3891 assert(data != NULL);
3892 }
3893 else {
3894 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
3895 if (!unicode)
3896 return NULL;
3897 /* When the string is ASCII only, just use memcpy and return.
3898 unicode_size may be != size if there is an incomplete UTF-8
3899 sequence at the end of the ASCII block. */
3900 if (maxchar < 128 && size == unicode_size) {
3901 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
3902 return (PyObject *)unicode;
3903 }
3904 kind = PyUnicode_KIND(unicode);
3905 data = PyUnicode_DATA(unicode);
3906 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003907 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003908 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003909 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00003910 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003911
3912 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003913 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003914
3915 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00003916 /* Fast path for runs of ASCII characters. Given that common UTF-8
3917 input will consist of an overwhelming majority of ASCII
3918 characters, we try to optimize for this case by checking
3919 as many characters as a C 'long' can contain.
3920 First, check if we can do an aligned read, as most CPUs have
3921 a penalty for unaligned reads.
3922 */
3923 if (!((size_t) s & LONG_PTR_MASK)) {
3924 /* Help register allocation */
3925 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003926 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003927 while (_s < aligned_end) {
3928 /* Read a whole long at a time (either 4 or 8 bytes),
3929 and do a fast unrolled copy if it only contains ASCII
3930 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003931 unsigned long value = *(unsigned long *) _s;
3932 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00003933 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003934 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
3935 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
3936 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
3937 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00003938#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003939 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
3940 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
3941 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
3942 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00003943#endif
3944 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003945 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00003946 }
3947 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003948 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003949 if (s == e)
3950 break;
3951 ch = (unsigned char)*s;
3952 }
3953 }
3954
3955 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003956 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003957 s++;
3958 continue;
3959 }
3960
3961 n = utf8_code_length[ch];
3962
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003963 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003964 if (consumed)
3965 break;
3966 else {
3967 errmsg = "unexpected end of data";
3968 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003969 endinpos = startinpos+1;
3970 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
3971 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00003972 goto utf8Error;
3973 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003974 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003975
3976 switch (n) {
3977
3978 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00003979 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003980 startinpos = s-starts;
3981 endinpos = startinpos+1;
3982 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003983
3984 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003985 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00003986 startinpos = s-starts;
3987 endinpos = startinpos+1;
3988 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003989
3990 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003991 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00003992 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003993 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003994 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00003995 goto utf8Error;
3996 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003997 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00003998 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003999 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004000 break;
4001
4002 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004003 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4004 will result in surrogates in range d800-dfff. Surrogates are
4005 not valid UTF-8 so they are rejected.
4006 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4007 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004008 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004009 (s[2] & 0xc0) != 0x80 ||
4010 ((unsigned char)s[0] == 0xE0 &&
4011 (unsigned char)s[1] < 0xA0) ||
4012 ((unsigned char)s[0] == 0xED &&
4013 (unsigned char)s[1] > 0x9F)) {
4014 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004015 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004016 endinpos = startinpos + 1;
4017
4018 /* if s[1] first two bits are 1 and 0, then the invalid
4019 continuation byte is s[2], so increment endinpos by 1,
4020 if not, s[1] is invalid and endinpos doesn't need to
4021 be incremented. */
4022 if ((s[1] & 0xC0) == 0x80)
4023 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004024 goto utf8Error;
4025 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004026 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004027 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004028 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004029 break;
4030
4031 case 4:
4032 if ((s[1] & 0xc0) != 0x80 ||
4033 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004034 (s[3] & 0xc0) != 0x80 ||
4035 ((unsigned char)s[0] == 0xF0 &&
4036 (unsigned char)s[1] < 0x90) ||
4037 ((unsigned char)s[0] == 0xF4 &&
4038 (unsigned char)s[1] > 0x8F)) {
4039 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004040 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004041 endinpos = startinpos + 1;
4042 if ((s[1] & 0xC0) == 0x80) {
4043 endinpos++;
4044 if ((s[2] & 0xC0) == 0x80)
4045 endinpos++;
4046 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004047 goto utf8Error;
4048 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004049 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004050 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4051 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4052
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004053 /* If the string is flexible or we have native UCS-4, write
4054 directly.. */
4055 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
4056 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00004057
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004058 else {
4059 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00004060
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004061 /* translate from 10000..10FFFF to 0..FFFF */
4062 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00004063
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004064 /* high surrogate = top 10 bits added to D800 */
4065 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4066 (Py_UNICODE)(0xD800 + (ch >> 10)));
4067
4068 /* low surrogate = bottom 10 bits added to DC00 */
4069 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4070 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
4071 }
4072#if SIZEOF_WCHAR_T == 2
4073 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004074#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004075 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004076 }
4077 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004078 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004079
Benjamin Peterson29060642009-01-31 22:14:21 +00004080 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004081 /* If this is not yet a resizable string, make it one.. */
4082 if (kind != PyUnicode_WCHAR_KIND) {
4083 const Py_UNICODE *u;
4084 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
4085 if (!new_unicode)
4086 goto onError;
4087 u = PyUnicode_AsUnicode((PyObject *)unicode);
4088 if (!u)
4089 goto onError;
4090#if SIZEOF_WCHAR_T == 2
4091 i += wchar_offset;
4092#endif
4093 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
4094 Py_DECREF(unicode);
4095 unicode = new_unicode;
4096 kind = 0;
4097 data = PyUnicode_AS_UNICODE(new_unicode);
4098 assert(data != NULL);
4099 }
4100 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00004101 if (unicode_decode_call_errorhandler(
4102 errors, &errorHandler,
4103 "utf8", errmsg,
4104 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004105 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00004106 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004107 /* Update data because unicode_decode_call_errorhandler might have
4108 re-created or resized the unicode object. */
4109 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00004110 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004111 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004112 /* Ensure the unicode_size calculation above was correct: */
4113 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
4114
Walter Dörwald69652032004-09-07 20:24:22 +00004115 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004116 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004117
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004118 /* Adjust length and ready string when it contained errors and
4119 is of the old resizable kind. */
4120 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02004121 if (PyUnicode_Resize((PyObject**)&unicode, i) < 0 ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004122 PyUnicode_READY(unicode) == -1)
4123 goto onError;
4124 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004125
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004126 Py_XDECREF(errorHandler);
4127 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004128 if (PyUnicode_READY(unicode) == -1) {
4129 Py_DECREF(unicode);
4130 return NULL;
4131 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004132 return (PyObject *)unicode;
4133
Benjamin Peterson29060642009-01-31 22:14:21 +00004134 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004135 Py_XDECREF(errorHandler);
4136 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004137 Py_DECREF(unicode);
4138 return NULL;
4139}
4140
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004141#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00004142
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Decodes `size` bytes starting at `s` into a NUL-terminated wchar_t
   buffer obtained from PyMem_Malloc().  A byte that does not start a
   well-formed sequence is emitted on its own as 0xDC00 + byte, after which
   decoding resumes at the following byte.  Returns NULL if the buffer
   cannot be allocated. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *end;
    const unsigned char *b;
    wchar_t *buffer, *out;
    int seqlen;

    /* Note: size will always be longer than the resulting Unicode
       character count, so size + 1 elements hold the result plus the
       terminator. */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    buffer = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (buffer == NULL)
        return NULL;

    /* Unpack UTF-8 encoded data */
    out = buffer;
    end = s + size;
    while (s < end) {
        /* Holds the lead byte until a sequence has been fully validated;
           the escape path below relies on that. */
        Py_UCS4 ch = (unsigned char)*s;

        if (ch < 0x80) {
            *out++ = (wchar_t)ch;
            s++;
            continue;
        }

        seqlen = utf8_code_length[ch];
        if (s + seqlen > end)
            /* truncated sequence */
            goto surrogateescape;
        b = (const unsigned char *)s;

        switch (seqlen) {
        case 0:
        case 1:
            goto surrogateescape;

        case 2:
            if ((b[1] & 0xc0) != 0x80)
                goto surrogateescape;
            ch = ((b[0] & 0x1f) << 6) | (b[1] & 0x3f);
            assert ((ch > 0x007F) && (ch <= 0x07FF));
            *out++ = (wchar_t)ch;
            break;

        case 3:
            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
               will result in surrogates in range d800-dfff. Surrogates are
               not valid UTF-8 so they are rejected.
               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
            if ((b[1] & 0xc0) != 0x80 || (b[2] & 0xc0) != 0x80)
                goto surrogateescape;
            if (b[0] == 0xE0 && b[1] < 0xA0)
                goto surrogateescape;
            if (b[0] == 0xED && b[1] > 0x9F)
                goto surrogateescape;
            ch = ((b[0] & 0x0f) << 12) | ((b[1] & 0x3f) << 6) | (b[2] & 0x3f);
            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
            *out++ = (wchar_t)ch;
            break;

        case 4:
            if ((b[1] & 0xc0) != 0x80 ||
                (b[2] & 0xc0) != 0x80 ||
                (b[3] & 0xc0) != 0x80)
                goto surrogateescape;
            if (b[0] == 0xF0 && b[1] < 0x90)
                goto surrogateescape;
            if (b[0] == 0xF4 && b[1] > 0x8F)
                goto surrogateescape;
            ch = ((b[0] & 0x7) << 18) | ((b[1] & 0x3f) << 12) |
                 ((b[2] & 0x3f) << 6) | (b[3] & 0x3f);
            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));

#if SIZEOF_WCHAR_T == 4
            *out++ = (wchar_t)ch;
#else
            /* compute and append the two surrogates: */

            /* translate from 10000..10FFFF to 0..FFFF */
            ch -= 0x10000;

            /* high surrogate = top 10 bits added to D800 */
            *out++ = (wchar_t)(0xD800 + (ch >> 10));

            /* low surrogate = bottom 10 bits added to DC00 */
            *out++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
#endif
            break;
        }
        s += seqlen;
        continue;

      surrogateescape:
        /* ch is still the undecodable lead byte here. */
        *out++ = 0xDC00 + ch;
        s++;
    }
    *out = L'\0';
    return buffer;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004257
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004258/* Primary internal function which creates utf8 encoded bytes objects.
4259
4260 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004261 and allocate exactly as much space needed at the end. Else allocate the
4262 maximum possible needed (4 result bytes per Unicode character), and return
4263 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004264*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004265PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004266_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004267{
Tim Peters602f7402002-04-27 18:03:26 +00004268#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004269
Guido van Rossum98297ee2007-11-06 21:34:58 +00004270 Py_ssize_t i; /* index into s of next input byte */
4271 PyObject *result; /* result string object */
4272 char *p; /* next free byte in output buffer */
4273 Py_ssize_t nallocated; /* number of result bytes allocated */
4274 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004275 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004276 PyObject *errorHandler = NULL;
4277 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004278 int kind;
4279 void *data;
4280 Py_ssize_t size;
4281 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
4282#if SIZEOF_WCHAR_T == 2
4283 Py_ssize_t wchar_offset = 0;
4284#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004285
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004286 if (!PyUnicode_Check(unicode)) {
4287 PyErr_BadArgument();
4288 return NULL;
4289 }
4290
4291 if (PyUnicode_READY(unicode) == -1)
4292 return NULL;
4293
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004294 if (PyUnicode_UTF8(unicode))
4295 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4296 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004297
4298 kind = PyUnicode_KIND(unicode);
4299 data = PyUnicode_DATA(unicode);
4300 size = PyUnicode_GET_LENGTH(unicode);
4301
Tim Peters602f7402002-04-27 18:03:26 +00004302 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004303
Tim Peters602f7402002-04-27 18:03:26 +00004304 if (size <= MAX_SHORT_UNICHARS) {
4305 /* Write into the stack buffer; nallocated can't overflow.
4306 * At the end, we'll allocate exactly as much heap space as it
4307 * turns out we need.
4308 */
4309 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004310 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004311 p = stackbuf;
4312 }
4313 else {
4314 /* Overallocate on the heap, and give the excess back at the end. */
4315 nallocated = size * 4;
4316 if (nallocated / 4 != size) /* overflow! */
4317 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004318 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004319 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004320 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004321 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004322 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004323
Tim Peters602f7402002-04-27 18:03:26 +00004324 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004325 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004326
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004327 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004328 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004329 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004330
Guido van Rossumd57fd912000-03-10 22:53:23 +00004331 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004332 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004333 *p++ = (char)(0xc0 | (ch >> 6));
4334 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004335 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004336 Py_ssize_t newpos;
4337 PyObject *rep;
4338 Py_ssize_t repsize, k, startpos;
4339 startpos = i-1;
4340#if SIZEOF_WCHAR_T == 2
4341 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00004342#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004343 rep = unicode_encode_call_errorhandler(
4344 errors, &errorHandler, "utf-8", "surrogates not allowed",
4345 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4346 &exc, startpos, startpos+1, &newpos);
4347 if (!rep)
4348 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004349
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004350 if (PyBytes_Check(rep))
4351 repsize = PyBytes_GET_SIZE(rep);
4352 else
4353 repsize = PyUnicode_GET_SIZE(rep);
4354
4355 if (repsize > 4) {
4356 Py_ssize_t offset;
4357
4358 if (result == NULL)
4359 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004360 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004361 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004362
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004363 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4364 /* integer overflow */
4365 PyErr_NoMemory();
4366 goto error;
4367 }
4368 nallocated += repsize - 4;
4369 if (result != NULL) {
4370 if (_PyBytes_Resize(&result, nallocated) < 0)
4371 goto error;
4372 } else {
4373 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004374 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004375 goto error;
4376 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4377 }
4378 p = PyBytes_AS_STRING(result) + offset;
4379 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004380
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004381 if (PyBytes_Check(rep)) {
4382 char *prep = PyBytes_AS_STRING(rep);
4383 for(k = repsize; k > 0; k--)
4384 *p++ = *prep++;
4385 } else /* rep is unicode */ {
4386 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4387 Py_UNICODE c;
4388
4389 for(k=0; k<repsize; k++) {
4390 c = prep[k];
4391 if (0x80 <= c) {
4392 raise_encode_exception(&exc, "utf-8",
4393 PyUnicode_AS_UNICODE(unicode),
4394 size, i-1, i,
4395 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004396 goto error;
4397 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004398 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004399 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004400 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004401 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004402 } else if (ch < 0x10000) {
4403 *p++ = (char)(0xe0 | (ch >> 12));
4404 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4405 *p++ = (char)(0x80 | (ch & 0x3f));
4406 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004407 /* Encode UCS4 Unicode ordinals */
4408 *p++ = (char)(0xf0 | (ch >> 18));
4409 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4410 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4411 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004412#if SIZEOF_WCHAR_T == 2
4413 wchar_offset++;
4414#endif
Tim Peters602f7402002-04-27 18:03:26 +00004415 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004416 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004417
Guido van Rossum98297ee2007-11-06 21:34:58 +00004418 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004419 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004420 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004421 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004422 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004423 }
4424 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004425 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004426 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004427 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004428 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004429 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004430
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004431 Py_XDECREF(errorHandler);
4432 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004433 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004434 error:
4435 Py_XDECREF(errorHandler);
4436 Py_XDECREF(exc);
4437 Py_XDECREF(result);
4438 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004439
Tim Peters602f7402002-04-27 18:03:26 +00004440#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004441}
4442
Alexander Belopolsky40018472011-02-26 01:02:56 +00004443PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004444PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4445 Py_ssize_t size,
4446 const char *errors)
4447{
4448 PyObject *v, *unicode;
4449
4450 unicode = PyUnicode_FromUnicode(s, size);
4451 if (unicode == NULL)
4452 return NULL;
4453 v = _PyUnicode_AsUTF8String(unicode, errors);
4454 Py_DECREF(unicode);
4455 return v;
4456}
4457
4458PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004459PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004460{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004461 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004462}
4463
Walter Dörwald41980ca2007-08-16 21:55:45 +00004464/* --- UTF-32 Codec ------------------------------------------------------- */
4465
4466PyObject *
4467PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004468 Py_ssize_t size,
4469 const char *errors,
4470 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004471{
4472 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4473}
4474
4475PyObject *
4476PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004477 Py_ssize_t size,
4478 const char *errors,
4479 int *byteorder,
4480 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004481{
4482 const char *starts = s;
4483 Py_ssize_t startinpos;
4484 Py_ssize_t endinpos;
4485 Py_ssize_t outpos;
4486 PyUnicodeObject *unicode;
4487 Py_UNICODE *p;
4488#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004489 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004490 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004491#else
4492 const int pairs = 0;
4493#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004494 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004495 int bo = 0; /* assume native ordering by default */
4496 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004497 /* Offsets from q for retrieving bytes in the right order. */
4498#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4499 int iorder[] = {0, 1, 2, 3};
4500#else
4501 int iorder[] = {3, 2, 1, 0};
4502#endif
4503 PyObject *errorHandler = NULL;
4504 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004505
Walter Dörwald41980ca2007-08-16 21:55:45 +00004506 q = (unsigned char *)s;
4507 e = q + size;
4508
4509 if (byteorder)
4510 bo = *byteorder;
4511
4512 /* Check for BOM marks (U+FEFF) in the input and adjust current
4513 byte order setting accordingly. In native mode, the leading BOM
4514 mark is skipped, in all other modes, it is copied to the output
4515 stream as-is (giving a ZWNBSP character). */
4516 if (bo == 0) {
4517 if (size >= 4) {
4518 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004519 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004520#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004521 if (bom == 0x0000FEFF) {
4522 q += 4;
4523 bo = -1;
4524 }
4525 else if (bom == 0xFFFE0000) {
4526 q += 4;
4527 bo = 1;
4528 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004529#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004530 if (bom == 0x0000FEFF) {
4531 q += 4;
4532 bo = 1;
4533 }
4534 else if (bom == 0xFFFE0000) {
4535 q += 4;
4536 bo = -1;
4537 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004538#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004539 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004540 }
4541
4542 if (bo == -1) {
4543 /* force LE */
4544 iorder[0] = 0;
4545 iorder[1] = 1;
4546 iorder[2] = 2;
4547 iorder[3] = 3;
4548 }
4549 else if (bo == 1) {
4550 /* force BE */
4551 iorder[0] = 3;
4552 iorder[1] = 2;
4553 iorder[2] = 1;
4554 iorder[3] = 0;
4555 }
4556
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004557 /* On narrow builds we split characters outside the BMP into two
4558 codepoints => count how much extra space we need. */
4559#ifndef Py_UNICODE_WIDE
4560 for (qq = q; qq < e; qq += 4)
4561 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4562 pairs++;
4563#endif
4564
4565 /* This might be one to much, because of a BOM */
4566 unicode = _PyUnicode_New((size+3)/4+pairs);
4567 if (!unicode)
4568 return NULL;
4569 if (size == 0)
4570 return (PyObject *)unicode;
4571
4572 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004573 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004574
Walter Dörwald41980ca2007-08-16 21:55:45 +00004575 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004576 Py_UCS4 ch;
4577 /* remaining bytes at the end? (size should be divisible by 4) */
4578 if (e-q<4) {
4579 if (consumed)
4580 break;
4581 errmsg = "truncated data";
4582 startinpos = ((const char *)q)-starts;
4583 endinpos = ((const char *)e)-starts;
4584 goto utf32Error;
4585 /* The remaining input chars are ignored if the callback
4586 chooses to skip the input */
4587 }
4588 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4589 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004590
Benjamin Peterson29060642009-01-31 22:14:21 +00004591 if (ch >= 0x110000)
4592 {
4593 errmsg = "codepoint not in range(0x110000)";
4594 startinpos = ((const char *)q)-starts;
4595 endinpos = startinpos+4;
4596 goto utf32Error;
4597 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004598#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004599 if (ch >= 0x10000)
4600 {
4601 *p++ = 0xD800 | ((ch-0x10000) >> 10);
4602 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4603 }
4604 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00004605#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004606 *p++ = ch;
4607 q += 4;
4608 continue;
4609 utf32Error:
4610 outpos = p-PyUnicode_AS_UNICODE(unicode);
4611 if (unicode_decode_call_errorhandler(
4612 errors, &errorHandler,
4613 "utf32", errmsg,
4614 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4615 &unicode, &outpos, &p))
4616 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004617 }
4618
4619 if (byteorder)
4620 *byteorder = bo;
4621
4622 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004623 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004624
4625 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02004626 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004627 goto onError;
4628
4629 Py_XDECREF(errorHandler);
4630 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004631 if (PyUnicode_READY(unicode) == -1) {
4632 Py_DECREF(unicode);
4633 return NULL;
4634 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004635 return (PyObject *)unicode;
4636
Benjamin Peterson29060642009-01-31 22:14:21 +00004637 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00004638 Py_DECREF(unicode);
4639 Py_XDECREF(errorHandler);
4640 Py_XDECREF(exc);
4641 return NULL;
4642}
4643
4644PyObject *
4645PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004646 Py_ssize_t size,
4647 const char *errors,
4648 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004649{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004650 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004651 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004652 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004653#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004654 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004655#else
4656 const int pairs = 0;
4657#endif
4658 /* Offsets from p for storing byte pairs in the right order. */
4659#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4660 int iorder[] = {0, 1, 2, 3};
4661#else
4662 int iorder[] = {3, 2, 1, 0};
4663#endif
4664
Benjamin Peterson29060642009-01-31 22:14:21 +00004665#define STORECHAR(CH) \
4666 do { \
4667 p[iorder[3]] = ((CH) >> 24) & 0xff; \
4668 p[iorder[2]] = ((CH) >> 16) & 0xff; \
4669 p[iorder[1]] = ((CH) >> 8) & 0xff; \
4670 p[iorder[0]] = (CH) & 0xff; \
4671 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00004672 } while(0)
4673
4674 /* In narrow builds we can output surrogate pairs as one codepoint,
4675 so we need less space. */
4676#ifndef Py_UNICODE_WIDE
4677 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004678 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
4679 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
4680 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004681#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004682 nsize = (size - pairs + (byteorder == 0));
4683 bytesize = nsize * 4;
4684 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004685 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004686 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004687 if (v == NULL)
4688 return NULL;
4689
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004690 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004691 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004692 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004693 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004694 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004695
4696 if (byteorder == -1) {
4697 /* force LE */
4698 iorder[0] = 0;
4699 iorder[1] = 1;
4700 iorder[2] = 2;
4701 iorder[3] = 3;
4702 }
4703 else if (byteorder == 1) {
4704 /* force BE */
4705 iorder[0] = 3;
4706 iorder[1] = 2;
4707 iorder[2] = 1;
4708 iorder[3] = 0;
4709 }
4710
4711 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004712 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004713#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004714 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
4715 Py_UCS4 ch2 = *s;
4716 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
4717 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
4718 s++;
4719 size--;
4720 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004721 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004722#endif
4723 STORECHAR(ch);
4724 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004725
4726 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004727 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004728#undef STORECHAR
4729}
4730
Alexander Belopolsky40018472011-02-26 01:02:56 +00004731PyObject *
4732PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004733{
4734 if (!PyUnicode_Check(unicode)) {
4735 PyErr_BadArgument();
4736 return NULL;
4737 }
4738 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004739 PyUnicode_GET_SIZE(unicode),
4740 NULL,
4741 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004742}
4743
/* --- UTF-16 Codec ------------------------------------------------------- */
4745
Tim Peters772747b2001-08-09 22:21:55 +00004746PyObject *
4747PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004748 Py_ssize_t size,
4749 const char *errors,
4750 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004751{
Walter Dörwald69652032004-09-07 20:24:22 +00004752 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
4753}
4754
Antoine Pitrouab868312009-01-10 15:40:25 +00004755/* Two masks for fast checking of whether a C 'long' may contain
4756 UTF16-encoded surrogate characters. This is an efficient heuristic,
4757 assuming that non-surrogate characters with a code point >= 0x8000 are
4758 rare in most input.
4759 FAST_CHAR_MASK is used when the input is in native byte ordering,
4760 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00004761*/
Antoine Pitrouab868312009-01-10 15:40:25 +00004762#if (SIZEOF_LONG == 8)
4763# define FAST_CHAR_MASK 0x8000800080008000L
4764# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
4765#elif (SIZEOF_LONG == 4)
4766# define FAST_CHAR_MASK 0x80008000L
4767# define SWAPPED_FAST_CHAR_MASK 0x00800080L
4768#else
4769# error C 'long' size should be either 4 or 8!
4770#endif
4771
Walter Dörwald69652032004-09-07 20:24:22 +00004772PyObject *
4773PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004774 Py_ssize_t size,
4775 const char *errors,
4776 int *byteorder,
4777 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004778{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004779 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004780 Py_ssize_t startinpos;
4781 Py_ssize_t endinpos;
4782 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004783 PyUnicodeObject *unicode;
4784 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004785 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00004786 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00004787 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004788 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00004789 /* Offsets from q for retrieving byte pairs in the right order. */
4790#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4791 int ihi = 1, ilo = 0;
4792#else
4793 int ihi = 0, ilo = 1;
4794#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004795 PyObject *errorHandler = NULL;
4796 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004797
4798 /* Note: size will always be longer than the resulting Unicode
4799 character count */
4800 unicode = _PyUnicode_New(size);
4801 if (!unicode)
4802 return NULL;
4803 if (size == 0)
4804 return (PyObject *)unicode;
4805
4806 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004807 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00004808 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00004809 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004810
4811 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00004812 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004813
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004814 /* Check for BOM marks (U+FEFF) in the input and adjust current
4815 byte order setting accordingly. In native mode, the leading BOM
4816 mark is skipped, in all other modes, it is copied to the output
4817 stream as-is (giving a ZWNBSP character). */
4818 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00004819 if (size >= 2) {
4820 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004821#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004822 if (bom == 0xFEFF) {
4823 q += 2;
4824 bo = -1;
4825 }
4826 else if (bom == 0xFFFE) {
4827 q += 2;
4828 bo = 1;
4829 }
Tim Petersced69f82003-09-16 20:30:58 +00004830#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004831 if (bom == 0xFEFF) {
4832 q += 2;
4833 bo = 1;
4834 }
4835 else if (bom == 0xFFFE) {
4836 q += 2;
4837 bo = -1;
4838 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004839#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004840 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004841 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004842
Tim Peters772747b2001-08-09 22:21:55 +00004843 if (bo == -1) {
4844 /* force LE */
4845 ihi = 1;
4846 ilo = 0;
4847 }
4848 else if (bo == 1) {
4849 /* force BE */
4850 ihi = 0;
4851 ilo = 1;
4852 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004853#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4854 native_ordering = ilo < ihi;
4855#else
4856 native_ordering = ilo > ihi;
4857#endif
Tim Peters772747b2001-08-09 22:21:55 +00004858
Antoine Pitrouab868312009-01-10 15:40:25 +00004859 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00004860 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004861 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00004862 /* First check for possible aligned read of a C 'long'. Unaligned
4863 reads are more expensive, better to defer to another iteration. */
4864 if (!((size_t) q & LONG_PTR_MASK)) {
4865 /* Fast path for runs of non-surrogate chars. */
4866 register const unsigned char *_q = q;
4867 Py_UNICODE *_p = p;
4868 if (native_ordering) {
4869 /* Native ordering is simple: as long as the input cannot
4870 possibly contain a surrogate char, do an unrolled copy
4871 of several 16-bit code points to the target object.
4872 The non-surrogate check is done on several input bytes
4873 at a time (as many as a C 'long' can contain). */
4874 while (_q < aligned_end) {
4875 unsigned long data = * (unsigned long *) _q;
4876 if (data & FAST_CHAR_MASK)
4877 break;
4878 _p[0] = ((unsigned short *) _q)[0];
4879 _p[1] = ((unsigned short *) _q)[1];
4880#if (SIZEOF_LONG == 8)
4881 _p[2] = ((unsigned short *) _q)[2];
4882 _p[3] = ((unsigned short *) _q)[3];
4883#endif
4884 _q += SIZEOF_LONG;
4885 _p += SIZEOF_LONG / 2;
4886 }
4887 }
4888 else {
4889 /* Byteswapped ordering is similar, but we must decompose
4890 the copy bytewise, and take care of zero'ing out the
4891 upper bytes if the target object is in 32-bit units
4892 (that is, in UCS-4 builds). */
4893 while (_q < aligned_end) {
4894 unsigned long data = * (unsigned long *) _q;
4895 if (data & SWAPPED_FAST_CHAR_MASK)
4896 break;
4897 /* Zero upper bytes in UCS-4 builds */
4898#if (Py_UNICODE_SIZE > 2)
4899 _p[0] = 0;
4900 _p[1] = 0;
4901#if (SIZEOF_LONG == 8)
4902 _p[2] = 0;
4903 _p[3] = 0;
4904#endif
4905#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004906 /* Issue #4916; UCS-4 builds on big endian machines must
4907 fill the two last bytes of each 4-byte unit. */
4908#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
4909# define OFF 2
4910#else
4911# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00004912#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004913 ((unsigned char *) _p)[OFF + 1] = _q[0];
4914 ((unsigned char *) _p)[OFF + 0] = _q[1];
4915 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
4916 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
4917#if (SIZEOF_LONG == 8)
4918 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
4919 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
4920 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
4921 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
4922#endif
4923#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00004924 _q += SIZEOF_LONG;
4925 _p += SIZEOF_LONG / 2;
4926 }
4927 }
4928 p = _p;
4929 q = _q;
4930 if (q >= e)
4931 break;
4932 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004933 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004934
Benjamin Peterson14339b62009-01-31 16:36:08 +00004935 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00004936
4937 if (ch < 0xD800 || ch > 0xDFFF) {
4938 *p++ = ch;
4939 continue;
4940 }
4941
4942 /* UTF-16 code pair: */
4943 if (q > e) {
4944 errmsg = "unexpected end of data";
4945 startinpos = (((const char *)q) - 2) - starts;
4946 endinpos = ((const char *)e) + 1 - starts;
4947 goto utf16Error;
4948 }
4949 if (0xD800 <= ch && ch <= 0xDBFF) {
4950 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
4951 q += 2;
4952 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00004953#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004954 *p++ = ch;
4955 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004956#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004957 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004958#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004959 continue;
4960 }
4961 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004962 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00004963 startinpos = (((const char *)q)-4)-starts;
4964 endinpos = startinpos+2;
4965 goto utf16Error;
4966 }
4967
Benjamin Peterson14339b62009-01-31 16:36:08 +00004968 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004969 errmsg = "illegal encoding";
4970 startinpos = (((const char *)q)-2)-starts;
4971 endinpos = startinpos+2;
4972 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004973
Benjamin Peterson29060642009-01-31 22:14:21 +00004974 utf16Error:
4975 outpos = p - PyUnicode_AS_UNICODE(unicode);
4976 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00004977 errors,
4978 &errorHandler,
4979 "utf16", errmsg,
4980 &starts,
4981 (const char **)&e,
4982 &startinpos,
4983 &endinpos,
4984 &exc,
4985 (const char **)&q,
4986 &unicode,
4987 &outpos,
4988 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00004989 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004990 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004991 /* remaining byte at the end? (size should be even) */
4992 if (e == q) {
4993 if (!consumed) {
4994 errmsg = "truncated data";
4995 startinpos = ((const char *)q) - starts;
4996 endinpos = ((const char *)e) + 1 - starts;
4997 outpos = p - PyUnicode_AS_UNICODE(unicode);
4998 if (unicode_decode_call_errorhandler(
4999 errors,
5000 &errorHandler,
5001 "utf16", errmsg,
5002 &starts,
5003 (const char **)&e,
5004 &startinpos,
5005 &endinpos,
5006 &exc,
5007 (const char **)&q,
5008 &unicode,
5009 &outpos,
5010 &p))
5011 goto onError;
5012 /* The remaining input chars are ignored if the callback
5013 chooses to skip the input */
5014 }
5015 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005016
5017 if (byteorder)
5018 *byteorder = bo;
5019
Walter Dörwald69652032004-09-07 20:24:22 +00005020 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005021 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005022
Guido van Rossumd57fd912000-03-10 22:53:23 +00005023 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02005024 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005025 goto onError;
5026
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005027 Py_XDECREF(errorHandler);
5028 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005029 if (PyUnicode_READY(unicode) == -1) {
5030 Py_DECREF(unicode);
5031 return NULL;
5032 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005033 return (PyObject *)unicode;
5034
Benjamin Peterson29060642009-01-31 22:14:21 +00005035 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005036 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005037 Py_XDECREF(errorHandler);
5038 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005039 return NULL;
5040}
5041
Antoine Pitrouab868312009-01-10 15:40:25 +00005042#undef FAST_CHAR_MASK
5043#undef SWAPPED_FAST_CHAR_MASK
5044
Tim Peters772747b2001-08-09 22:21:55 +00005045PyObject *
5046PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005047 Py_ssize_t size,
5048 const char *errors,
5049 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005050{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005051 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005052 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005053 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005054#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005055 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005056#else
5057 const int pairs = 0;
5058#endif
Tim Peters772747b2001-08-09 22:21:55 +00005059 /* Offsets from p for storing byte pairs in the right order. */
5060#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5061 int ihi = 1, ilo = 0;
5062#else
5063 int ihi = 0, ilo = 1;
5064#endif
5065
Benjamin Peterson29060642009-01-31 22:14:21 +00005066#define STORECHAR(CH) \
5067 do { \
5068 p[ihi] = ((CH) >> 8) & 0xff; \
5069 p[ilo] = (CH) & 0xff; \
5070 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005071 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005072
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005073#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005074 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00005075 if (s[i] >= 0x10000)
5076 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005077#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005078 /* 2 * (size + pairs + (byteorder == 0)) */
5079 if (size > PY_SSIZE_T_MAX ||
5080 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005081 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005082 nsize = size + pairs + (byteorder == 0);
5083 bytesize = nsize * 2;
5084 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005085 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005086 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005087 if (v == NULL)
5088 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005089
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005090 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005091 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005092 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005093 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005094 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005095
5096 if (byteorder == -1) {
5097 /* force LE */
5098 ihi = 1;
5099 ilo = 0;
5100 }
5101 else if (byteorder == 1) {
5102 /* force BE */
5103 ihi = 0;
5104 ilo = 1;
5105 }
5106
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005107 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005108 Py_UNICODE ch = *s++;
5109 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005110#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005111 if (ch >= 0x10000) {
5112 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5113 ch = 0xD800 | ((ch-0x10000) >> 10);
5114 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005115#endif
Tim Peters772747b2001-08-09 22:21:55 +00005116 STORECHAR(ch);
5117 if (ch2)
5118 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005119 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005120
5121 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005122 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005123#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005124}
5125
Alexander Belopolsky40018472011-02-26 01:02:56 +00005126PyObject *
5127PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005128{
5129 if (!PyUnicode_Check(unicode)) {
5130 PyErr_BadArgument();
5131 return NULL;
5132 }
5133 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005134 PyUnicode_GET_SIZE(unicode),
5135 NULL,
5136 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005137}
5138
/* --- Unicode Escape Codec ----------------------------------------------- */
5140
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005141/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5142 if all the escapes in the string make it still a valid ASCII string.
5143 Returns -1 if any escapes were found which cause the string to
5144 pop out of ASCII range. Otherwise returns the length of the
5145 required buffer to hold the string.
5146 */
5147Py_ssize_t
5148length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5149{
5150 const unsigned char *p = (const unsigned char *)s;
5151 const unsigned char *end = p + size;
5152 Py_ssize_t length = 0;
5153
5154 if (size < 0)
5155 return -1;
5156
5157 for (; p < end; ++p) {
5158 if (*p > 127) {
5159 /* Non-ASCII */
5160 return -1;
5161 }
5162 else if (*p != '\\') {
5163 /* Normal character */
5164 ++length;
5165 }
5166 else {
5167 /* Backslash-escape, check next char */
5168 ++p;
5169 /* Escape sequence reaches till end of string or
5170 non-ASCII follow-up. */
5171 if (p >= end || *p > 127)
5172 return -1;
5173 switch (*p) {
5174 case '\n':
5175 /* backslash + \n result in zero characters */
5176 break;
5177 case '\\': case '\'': case '\"':
5178 case 'b': case 'f': case 't':
5179 case 'n': case 'r': case 'v': case 'a':
5180 ++length;
5181 break;
5182 case '0': case '1': case '2': case '3':
5183 case '4': case '5': case '6': case '7':
5184 case 'x': case 'u': case 'U': case 'N':
5185 /* these do not guarantee ASCII characters */
5186 return -1;
5187 default:
5188 /* count the backslash + the other character */
5189 length += 2;
5190 }
5191 }
5192 }
5193 return length;
5194}
5195
5196/* Similar to PyUnicode_WRITE but either write into wstr field
5197 or treat string as ASCII. */
5198#define WRITE_ASCII_OR_WSTR(kind, buf, index, value) \
5199 do { \
5200 if ((kind) != PyUnicode_WCHAR_KIND) \
5201 ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
5202 else \
5203 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
5204 } while (0)
5205
5206#define WRITE_WSTR(buf, index, value) \
5207 assert(kind == PyUnicode_WCHAR_KIND), \
5208 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value)
5209
5210
Fredrik Lundh06d12682001-01-24 07:59:11 +00005211static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005212
Alexander Belopolsky40018472011-02-26 01:02:56 +00005213PyObject *
5214PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005215 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005216 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005217{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005218 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005219 Py_ssize_t startinpos;
5220 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005221 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005222 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005223 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005224 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005225 char* message;
5226 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005227 PyObject *errorHandler = NULL;
5228 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005229 Py_ssize_t ascii_length;
5230 Py_ssize_t i;
5231 int kind;
5232 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005233
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005234 ascii_length = length_of_escaped_ascii_string(s, size);
5235
5236 /* After length_of_escaped_ascii_string() there are two alternatives,
5237 either the string is pure ASCII with named escapes like \n, etc.
5238 and we determined it's exact size (common case)
5239 or it contains \x, \u, ... escape sequences. then we create a
5240 legacy wchar string and resize it at the end of this function. */
5241 if (ascii_length >= 0) {
5242 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
5243 if (!v)
5244 goto onError;
5245 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5246 kind = PyUnicode_1BYTE_KIND;
5247 data = PyUnicode_DATA(v);
5248 }
5249 else {
5250 /* Escaped strings will always be longer than the resulting
5251 Unicode string, so we start with size here and then reduce the
5252 length after conversion to the true value.
5253 (but if the error callback returns a long replacement string
5254 we'll have to allocate more space) */
5255 v = _PyUnicode_New(size);
5256 if (!v)
5257 goto onError;
5258 kind = PyUnicode_WCHAR_KIND;
5259 data = PyUnicode_AS_UNICODE(v);
5260 }
5261
Guido van Rossumd57fd912000-03-10 22:53:23 +00005262 if (size == 0)
5263 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005264 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005265 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005266
Guido van Rossumd57fd912000-03-10 22:53:23 +00005267 while (s < end) {
5268 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005269 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005270 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005271
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005272 if (kind == PyUnicode_WCHAR_KIND) {
5273 assert(i < _PyUnicode_WSTR_LENGTH(v));
5274 }
5275 else {
5276 /* The only case in which i == ascii_length is a backslash
5277 followed by a newline. */
5278 assert(i <= ascii_length);
5279 }
5280
Guido van Rossumd57fd912000-03-10 22:53:23 +00005281 /* Non-escape characters are interpreted as Unicode ordinals */
5282 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005283 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005284 continue;
5285 }
5286
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005287 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005288 /* \ - Escapes */
5289 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005290 c = *s++;
5291 if (s > end)
5292 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005293
5294 if (kind == PyUnicode_WCHAR_KIND) {
5295 assert(i < _PyUnicode_WSTR_LENGTH(v));
5296 }
5297 else {
5298 /* The only case in which i == ascii_length is a backslash
5299 followed by a newline. */
5300 assert(i < ascii_length || (i == ascii_length && c == '\n'));
5301 }
5302
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005303 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005304
Benjamin Peterson29060642009-01-31 22:14:21 +00005305 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005306 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005307 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5308 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5309 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5310 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5311 /* FF */
5312 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5313 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5314 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5315 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5316 /* VT */
5317 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5318 /* BEL, not classic C */
5319 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005320
Benjamin Peterson29060642009-01-31 22:14:21 +00005321 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005322 case '0': case '1': case '2': case '3':
5323 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005324 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005325 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005326 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005327 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005328 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005329 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005330 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005331 break;
5332
Benjamin Peterson29060642009-01-31 22:14:21 +00005333 /* hex escapes */
5334 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005335 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005336 digits = 2;
5337 message = "truncated \\xXX escape";
5338 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005339
Benjamin Peterson29060642009-01-31 22:14:21 +00005340 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005341 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005342 digits = 4;
5343 message = "truncated \\uXXXX escape";
5344 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005345
Benjamin Peterson29060642009-01-31 22:14:21 +00005346 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005347 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005348 digits = 8;
5349 message = "truncated \\UXXXXXXXX escape";
5350 hexescape:
5351 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005352 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005353 if (s+digits>end) {
5354 endinpos = size;
5355 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005356 errors, &errorHandler,
5357 "unicodeescape", "end of string in escape sequence",
5358 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005359 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005360 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005361 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005362 goto nextByte;
5363 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005364 for (j = 0; j < digits; ++j) {
5365 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005366 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005367 endinpos = (s+j+1)-starts;
5368 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005369 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005370 errors, &errorHandler,
5371 "unicodeescape", message,
5372 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005373 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005374 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005375 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005376 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005377 }
5378 chr = (chr<<4) & ~0xF;
5379 if (c >= '0' && c <= '9')
5380 chr += c - '0';
5381 else if (c >= 'a' && c <= 'f')
5382 chr += 10 + c - 'a';
5383 else
5384 chr += 10 + c - 'A';
5385 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005386 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005387 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005388 /* _decoding_error will have already written into the
5389 target buffer. */
5390 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005391 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005392 /* when we get here, chr is a 32-bit unicode character */
5393 if (chr <= 0xffff)
5394 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005395 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005396 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005397 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005398 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005399#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005400 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005401#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005402 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005403 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5404 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005405#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005406 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005407 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005408 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005409 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005410 errors, &errorHandler,
5411 "unicodeescape", "illegal Unicode character",
5412 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005413 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005414 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005415 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005416 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005417 break;
5418
Benjamin Peterson29060642009-01-31 22:14:21 +00005419 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005420 case 'N':
5421 message = "malformed \\N character escape";
5422 if (ucnhash_CAPI == NULL) {
5423 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005424 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5425 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005426 if (ucnhash_CAPI == NULL)
5427 goto ucnhashError;
5428 }
5429 if (*s == '{') {
5430 const char *start = s+1;
5431 /* look for the closing brace */
5432 while (*s != '}' && s < end)
5433 s++;
5434 if (s > start && s < end && *s == '}') {
5435 /* found a name. look it up in the unicode database */
5436 message = "unknown Unicode character name";
5437 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005438 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5439 &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005440 goto store;
5441 }
5442 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005443 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005444 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005445 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005446 errors, &errorHandler,
5447 "unicodeescape", message,
5448 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005449 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005450 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005451 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005452 break;
5453
5454 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005455 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005456 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005457 message = "\\ at end of string";
5458 s--;
5459 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005460 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005461 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005462 errors, &errorHandler,
5463 "unicodeescape", message,
5464 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005465 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005466 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005467 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005468 }
5469 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005470 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5471 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005472 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005473 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005474 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005475 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005476 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005477 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005478 /* Ensure the length prediction worked in case of ASCII strings */
5479 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5480
Victor Stinnerfe226c02011-10-03 03:52:20 +02005481 if (kind == PyUnicode_WCHAR_KIND)
5482 {
5483 if (PyUnicode_Resize((PyObject**)&v, i) < 0)
5484 goto onError;
5485 if (PyUnicode_READY(v) == -1)
5486 goto onError;
5487 }
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005488 Py_XDECREF(errorHandler);
5489 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005490 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005491
Benjamin Peterson29060642009-01-31 22:14:21 +00005492 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005493 PyErr_SetString(
5494 PyExc_UnicodeError,
5495 "\\N escapes not supported (can't load unicodedata module)"
5496 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005497 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005498 Py_XDECREF(errorHandler);
5499 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005500 return NULL;
5501
Benjamin Peterson29060642009-01-31 22:14:21 +00005502 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005503 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005504 Py_XDECREF(errorHandler);
5505 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005506 return NULL;
5507}
5508
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005509#undef WRITE_ASCII_OR_WSTR
5510#undef WRITE_WSTR
5511
/* Return a Unicode-Escape encoded bytes version of a Py_UNICODE buffer.

   The encoders below take no "quotes" argument: the result is never
   wrapped in enclosing quote characters.

*/

5518
Walter Dörwald79e913e2007-05-12 11:08:06 +00005519static const char *hexdigits = "0123456789abcdef";
5520
Alexander Belopolsky40018472011-02-26 01:02:56 +00005521PyObject *
5522PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005523 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005524{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005525 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005526 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005527
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005528#ifdef Py_UNICODE_WIDE
5529 const Py_ssize_t expandsize = 10;
5530#else
5531 const Py_ssize_t expandsize = 6;
5532#endif
5533
Thomas Wouters89f507f2006-12-13 04:49:30 +00005534 /* XXX(nnorwitz): rather than over-allocating, it would be
5535 better to choose a different scheme. Perhaps scan the
5536 first N-chars of the string and allocate based on that size.
5537 */
5538 /* Initial allocation is based on the longest-possible unichr
5539 escape.
5540
5541 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5542 unichr, so in this case it's the longest unichr escape. In
5543 narrow (UTF-16) builds this is five chars per source unichr
5544 since there are two unichrs in the surrogate pair, so in narrow
5545 (UTF-16) builds it's not the longest unichr escape.
5546
5547 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5548 so in the narrow (UTF-16) build case it's the longest unichr
5549 escape.
5550 */
5551
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005552 if (size == 0)
5553 return PyBytes_FromStringAndSize(NULL, 0);
5554
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005555 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005556 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005557
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005558 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005559 2
5560 + expandsize*size
5561 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005562 if (repr == NULL)
5563 return NULL;
5564
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005565 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005566
Guido van Rossumd57fd912000-03-10 22:53:23 +00005567 while (size-- > 0) {
5568 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005569
Walter Dörwald79e913e2007-05-12 11:08:06 +00005570 /* Escape backslashes */
5571 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005572 *p++ = '\\';
5573 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005574 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005575 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005576
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00005577#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005578 /* Map 21-bit characters to '\U00xxxxxx' */
5579 else if (ch >= 0x10000) {
5580 *p++ = '\\';
5581 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005582 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5583 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5584 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5585 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5586 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5587 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5588 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5589 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005590 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005591 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005592#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005593 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5594 else if (ch >= 0xD800 && ch < 0xDC00) {
5595 Py_UNICODE ch2;
5596 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00005597
Benjamin Peterson29060642009-01-31 22:14:21 +00005598 ch2 = *s++;
5599 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005600 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005601 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5602 *p++ = '\\';
5603 *p++ = 'U';
5604 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
5605 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
5606 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
5607 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
5608 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
5609 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
5610 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
5611 *p++ = hexdigits[ucs & 0x0000000F];
5612 continue;
5613 }
5614 /* Fall through: isolated surrogates are copied as-is */
5615 s--;
5616 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005617 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005618#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005619
Guido van Rossumd57fd912000-03-10 22:53:23 +00005620 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005621 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005622 *p++ = '\\';
5623 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005624 *p++ = hexdigits[(ch >> 12) & 0x000F];
5625 *p++ = hexdigits[(ch >> 8) & 0x000F];
5626 *p++ = hexdigits[(ch >> 4) & 0x000F];
5627 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005628 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005629
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005630 /* Map special whitespace to '\t', \n', '\r' */
5631 else if (ch == '\t') {
5632 *p++ = '\\';
5633 *p++ = 't';
5634 }
5635 else if (ch == '\n') {
5636 *p++ = '\\';
5637 *p++ = 'n';
5638 }
5639 else if (ch == '\r') {
5640 *p++ = '\\';
5641 *p++ = 'r';
5642 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005643
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005644 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005645 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005646 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005647 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005648 *p++ = hexdigits[(ch >> 4) & 0x000F];
5649 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005650 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005651
Guido van Rossumd57fd912000-03-10 22:53:23 +00005652 /* Copy everything else as-is */
5653 else
5654 *p++ = (char) ch;
5655 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005656
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005657 assert(p - PyBytes_AS_STRING(repr) > 0);
5658 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5659 return NULL;
5660 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005661}
5662
Alexander Belopolsky40018472011-02-26 01:02:56 +00005663PyObject *
5664PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005665{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005666 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005667 if (!PyUnicode_Check(unicode)) {
5668 PyErr_BadArgument();
5669 return NULL;
5670 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00005671 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5672 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005673 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005674}
5675
5676/* --- Raw Unicode Escape Codec ------------------------------------------- */
5677
Alexander Belopolsky40018472011-02-26 01:02:56 +00005678PyObject *
5679PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005680 Py_ssize_t size,
5681 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005682{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005683 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005684 Py_ssize_t startinpos;
5685 Py_ssize_t endinpos;
5686 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005687 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005688 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005689 const char *end;
5690 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005691 PyObject *errorHandler = NULL;
5692 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005693
Guido van Rossumd57fd912000-03-10 22:53:23 +00005694 /* Escaped strings will always be longer than the resulting
5695 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005696 length after conversion to the true value. (But decoding error
5697 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005698 v = _PyUnicode_New(size);
5699 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005700 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005701 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005702 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005703 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005704 end = s + size;
5705 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005706 unsigned char c;
5707 Py_UCS4 x;
5708 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005709 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005710
Benjamin Peterson29060642009-01-31 22:14:21 +00005711 /* Non-escape characters are interpreted as Unicode ordinals */
5712 if (*s != '\\') {
5713 *p++ = (unsigned char)*s++;
5714 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005715 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005716 startinpos = s-starts;
5717
5718 /* \u-escapes are only interpreted iff the number of leading
5719 backslashes if odd */
5720 bs = s;
5721 for (;s < end;) {
5722 if (*s != '\\')
5723 break;
5724 *p++ = (unsigned char)*s++;
5725 }
5726 if (((s - bs) & 1) == 0 ||
5727 s >= end ||
5728 (*s != 'u' && *s != 'U')) {
5729 continue;
5730 }
5731 p--;
5732 count = *s=='u' ? 4 : 8;
5733 s++;
5734
5735 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
5736 outpos = p-PyUnicode_AS_UNICODE(v);
5737 for (x = 0, i = 0; i < count; ++i, ++s) {
5738 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005739 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005740 endinpos = s-starts;
5741 if (unicode_decode_call_errorhandler(
5742 errors, &errorHandler,
5743 "rawunicodeescape", "truncated \\uXXXX",
5744 &starts, &end, &startinpos, &endinpos, &exc, &s,
5745 &v, &outpos, &p))
5746 goto onError;
5747 goto nextByte;
5748 }
5749 x = (x<<4) & ~0xF;
5750 if (c >= '0' && c <= '9')
5751 x += c - '0';
5752 else if (c >= 'a' && c <= 'f')
5753 x += 10 + c - 'a';
5754 else
5755 x += 10 + c - 'A';
5756 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00005757 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00005758 /* UCS-2 character */
5759 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005760 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005761 /* UCS-4 character. Either store directly, or as
5762 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00005763#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005764 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005765#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005766 x -= 0x10000L;
5767 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
5768 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00005769#endif
5770 } else {
5771 endinpos = s-starts;
5772 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005773 if (unicode_decode_call_errorhandler(
5774 errors, &errorHandler,
5775 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005776 &starts, &end, &startinpos, &endinpos, &exc, &s,
5777 &v, &outpos, &p))
5778 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005779 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005780 nextByte:
5781 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005782 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02005783 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005784 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005785 Py_XDECREF(errorHandler);
5786 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005787 if (PyUnicode_READY(v) == -1) {
5788 Py_DECREF(v);
5789 return NULL;
5790 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005791 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005792
Benjamin Peterson29060642009-01-31 22:14:21 +00005793 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005794 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005795 Py_XDECREF(errorHandler);
5796 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005797 return NULL;
5798}
5799
Alexander Belopolsky40018472011-02-26 01:02:56 +00005800PyObject *
5801PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005802 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005803{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005804 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005805 char *p;
5806 char *q;
5807
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005808#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005809 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005810#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005811 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005812#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00005813
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005814 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005815 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005816
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005817 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005818 if (repr == NULL)
5819 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005820 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005821 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005822
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005823 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005824 while (size-- > 0) {
5825 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005826#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005827 /* Map 32-bit characters to '\Uxxxxxxxx' */
5828 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005829 *p++ = '\\';
5830 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005831 *p++ = hexdigits[(ch >> 28) & 0xf];
5832 *p++ = hexdigits[(ch >> 24) & 0xf];
5833 *p++ = hexdigits[(ch >> 20) & 0xf];
5834 *p++ = hexdigits[(ch >> 16) & 0xf];
5835 *p++ = hexdigits[(ch >> 12) & 0xf];
5836 *p++ = hexdigits[(ch >> 8) & 0xf];
5837 *p++ = hexdigits[(ch >> 4) & 0xf];
5838 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00005839 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005840 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00005841#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005842 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5843 if (ch >= 0xD800 && ch < 0xDC00) {
5844 Py_UNICODE ch2;
5845 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005846
Benjamin Peterson29060642009-01-31 22:14:21 +00005847 ch2 = *s++;
5848 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005849 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005850 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5851 *p++ = '\\';
5852 *p++ = 'U';
5853 *p++ = hexdigits[(ucs >> 28) & 0xf];
5854 *p++ = hexdigits[(ucs >> 24) & 0xf];
5855 *p++ = hexdigits[(ucs >> 20) & 0xf];
5856 *p++ = hexdigits[(ucs >> 16) & 0xf];
5857 *p++ = hexdigits[(ucs >> 12) & 0xf];
5858 *p++ = hexdigits[(ucs >> 8) & 0xf];
5859 *p++ = hexdigits[(ucs >> 4) & 0xf];
5860 *p++ = hexdigits[ucs & 0xf];
5861 continue;
5862 }
5863 /* Fall through: isolated surrogates are copied as-is */
5864 s--;
5865 size++;
5866 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005867#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005868 /* Map 16-bit characters to '\uxxxx' */
5869 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005870 *p++ = '\\';
5871 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005872 *p++ = hexdigits[(ch >> 12) & 0xf];
5873 *p++ = hexdigits[(ch >> 8) & 0xf];
5874 *p++ = hexdigits[(ch >> 4) & 0xf];
5875 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005876 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005877 /* Copy everything else as-is */
5878 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00005879 *p++ = (char) ch;
5880 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005881 size = p - q;
5882
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005883 assert(size > 0);
5884 if (_PyBytes_Resize(&repr, size) < 0)
5885 return NULL;
5886 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005887}
5888
Alexander Belopolsky40018472011-02-26 01:02:56 +00005889PyObject *
5890PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005891{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005892 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005893 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00005894 PyErr_BadArgument();
5895 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005896 }
Walter Dörwald711005d2007-05-12 12:03:26 +00005897 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5898 PyUnicode_GET_SIZE(unicode));
5899
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005900 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005901}
5902
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005903/* --- Unicode Internal Codec ------------------------------------------- */
5904
Alexander Belopolsky40018472011-02-26 01:02:56 +00005905PyObject *
5906_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005907 Py_ssize_t size,
5908 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005909{
5910 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005911 Py_ssize_t startinpos;
5912 Py_ssize_t endinpos;
5913 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005914 PyUnicodeObject *v;
5915 Py_UNICODE *p;
5916 const char *end;
5917 const char *reason;
5918 PyObject *errorHandler = NULL;
5919 PyObject *exc = NULL;
5920
Neal Norwitzd43069c2006-01-08 01:12:10 +00005921#ifdef Py_UNICODE_WIDE
5922 Py_UNICODE unimax = PyUnicode_GetMax();
5923#endif
5924
Thomas Wouters89f507f2006-12-13 04:49:30 +00005925 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005926 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
5927 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005928 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005929 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
5930 as string was created with the old API. */
5931 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005932 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005933 p = PyUnicode_AS_UNICODE(v);
5934 end = s + size;
5935
5936 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005937 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005938 /* We have to sanity check the raw data, otherwise doom looms for
5939 some malformed UCS-4 data. */
5940 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00005941#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005942 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00005943#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005944 end-s < Py_UNICODE_SIZE
5945 )
Benjamin Peterson29060642009-01-31 22:14:21 +00005946 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005947 startinpos = s - starts;
5948 if (end-s < Py_UNICODE_SIZE) {
5949 endinpos = end-starts;
5950 reason = "truncated input";
5951 }
5952 else {
5953 endinpos = s - starts + Py_UNICODE_SIZE;
5954 reason = "illegal code point (> 0x10FFFF)";
5955 }
5956 outpos = p - PyUnicode_AS_UNICODE(v);
5957 if (unicode_decode_call_errorhandler(
5958 errors, &errorHandler,
5959 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00005960 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00005961 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005962 goto onError;
5963 }
5964 }
5965 else {
5966 p++;
5967 s += Py_UNICODE_SIZE;
5968 }
5969 }
5970
Victor Stinnerfe226c02011-10-03 03:52:20 +02005971 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005972 goto onError;
5973 Py_XDECREF(errorHandler);
5974 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005975 if (PyUnicode_READY(v) == -1) {
5976 Py_DECREF(v);
5977 return NULL;
5978 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005979 return (PyObject *)v;
5980
Benjamin Peterson29060642009-01-31 22:14:21 +00005981 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005982 Py_XDECREF(v);
5983 Py_XDECREF(errorHandler);
5984 Py_XDECREF(exc);
5985 return NULL;
5986}
5987
Guido van Rossumd57fd912000-03-10 22:53:23 +00005988/* --- Latin-1 Codec ------------------------------------------------------ */
5989
Alexander Belopolsky40018472011-02-26 01:02:56 +00005990PyObject *
5991PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005992 Py_ssize_t size,
5993 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005994{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005995 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02005996 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005997}
5998
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005999/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006000static void
6001make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006002 const char *encoding,
6003 const Py_UNICODE *unicode, Py_ssize_t size,
6004 Py_ssize_t startpos, Py_ssize_t endpos,
6005 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006006{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006007 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006008 *exceptionObject = PyUnicodeEncodeError_Create(
6009 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006010 }
6011 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006012 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6013 goto onError;
6014 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6015 goto onError;
6016 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6017 goto onError;
6018 return;
6019 onError:
6020 Py_DECREF(*exceptionObject);
6021 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006022 }
6023}
6024
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006025/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006026static void
6027raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006028 const char *encoding,
6029 const Py_UNICODE *unicode, Py_ssize_t size,
6030 Py_ssize_t startpos, Py_ssize_t endpos,
6031 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006032{
6033 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006034 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006035 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006036 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006037}
6038
6039/* error handling callback helper:
6040 build arguments, call the callback and check the arguments,
6041 put the result into newpos and return the replacement string, which
6042 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006043static PyObject *
6044unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006045 PyObject **errorHandler,
6046 const char *encoding, const char *reason,
6047 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
6048 Py_ssize_t startpos, Py_ssize_t endpos,
6049 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006050{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006051 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006052
6053 PyObject *restuple;
6054 PyObject *resunicode;
6055
6056 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006057 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006058 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006059 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006060 }
6061
6062 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006063 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006064 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006065 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006066
6067 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006068 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006069 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006070 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006071 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006072 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006073 Py_DECREF(restuple);
6074 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006075 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006076 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006077 &resunicode, newpos)) {
6078 Py_DECREF(restuple);
6079 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006080 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006081 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6082 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6083 Py_DECREF(restuple);
6084 return NULL;
6085 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006086 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006087 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006088 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006089 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6090 Py_DECREF(restuple);
6091 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006092 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006093 Py_INCREF(resunicode);
6094 Py_DECREF(restuple);
6095 return resunicode;
6096}
6097
Alexander Belopolsky40018472011-02-26 01:02:56 +00006098static PyObject *
6099unicode_encode_ucs1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006100 Py_ssize_t size,
6101 const char *errors,
6102 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006103{
6104 /* output object */
6105 PyObject *res;
6106 /* pointers to the beginning and end+1 of input */
6107 const Py_UNICODE *startp = p;
6108 const Py_UNICODE *endp = p + size;
6109 /* pointer to the beginning of the unencodable characters */
6110 /* const Py_UNICODE *badp = NULL; */
6111 /* pointer into the output */
6112 char *str;
6113 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006114 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006115 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6116 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006117 PyObject *errorHandler = NULL;
6118 PyObject *exc = NULL;
6119 /* the following variable is used for caching string comparisons
6120 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6121 int known_errorHandler = -1;
6122
6123 /* allocate enough for a simple encoding without
6124 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006125 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006126 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006127 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006128 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006129 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006130 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006131 ressize = size;
6132
6133 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006134 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006135
Benjamin Peterson29060642009-01-31 22:14:21 +00006136 /* can we encode this? */
6137 if (c<limit) {
6138 /* no overflow check, because we know that the space is enough */
6139 *str++ = (char)c;
6140 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006141 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006142 else {
6143 Py_ssize_t unicodepos = p-startp;
6144 Py_ssize_t requiredsize;
6145 PyObject *repunicode;
6146 Py_ssize_t repsize;
6147 Py_ssize_t newpos;
6148 Py_ssize_t respos;
6149 Py_UNICODE *uni2;
6150 /* startpos for collecting unencodable chars */
6151 const Py_UNICODE *collstart = p;
6152 const Py_UNICODE *collend = p;
6153 /* find all unecodable characters */
6154 while ((collend < endp) && ((*collend)>=limit))
6155 ++collend;
6156 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6157 if (known_errorHandler==-1) {
6158 if ((errors==NULL) || (!strcmp(errors, "strict")))
6159 known_errorHandler = 1;
6160 else if (!strcmp(errors, "replace"))
6161 known_errorHandler = 2;
6162 else if (!strcmp(errors, "ignore"))
6163 known_errorHandler = 3;
6164 else if (!strcmp(errors, "xmlcharrefreplace"))
6165 known_errorHandler = 4;
6166 else
6167 known_errorHandler = 0;
6168 }
6169 switch (known_errorHandler) {
6170 case 1: /* strict */
6171 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
6172 goto onError;
6173 case 2: /* replace */
6174 while (collstart++<collend)
6175 *str++ = '?'; /* fall through */
6176 case 3: /* ignore */
6177 p = collend;
6178 break;
6179 case 4: /* xmlcharrefreplace */
6180 respos = str - PyBytes_AS_STRING(res);
6181 /* determine replacement size (temporarily (mis)uses p) */
6182 for (p = collstart, repsize = 0; p < collend; ++p) {
6183 if (*p<10)
6184 repsize += 2+1+1;
6185 else if (*p<100)
6186 repsize += 2+2+1;
6187 else if (*p<1000)
6188 repsize += 2+3+1;
6189 else if (*p<10000)
6190 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006191#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006192 else
6193 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006194#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006195 else if (*p<100000)
6196 repsize += 2+5+1;
6197 else if (*p<1000000)
6198 repsize += 2+6+1;
6199 else
6200 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00006201#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006202 }
6203 requiredsize = respos+repsize+(endp-collend);
6204 if (requiredsize > ressize) {
6205 if (requiredsize<2*ressize)
6206 requiredsize = 2*ressize;
6207 if (_PyBytes_Resize(&res, requiredsize))
6208 goto onError;
6209 str = PyBytes_AS_STRING(res) + respos;
6210 ressize = requiredsize;
6211 }
6212 /* generate replacement (temporarily (mis)uses p) */
6213 for (p = collstart; p < collend; ++p) {
6214 str += sprintf(str, "&#%d;", (int)*p);
6215 }
6216 p = collend;
6217 break;
6218 default:
6219 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6220 encoding, reason, startp, size, &exc,
6221 collstart-startp, collend-startp, &newpos);
6222 if (repunicode == NULL)
6223 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006224 if (PyBytes_Check(repunicode)) {
6225 /* Directly copy bytes result to output. */
6226 repsize = PyBytes_Size(repunicode);
6227 if (repsize > 1) {
6228 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006229 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006230 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6231 Py_DECREF(repunicode);
6232 goto onError;
6233 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006234 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006235 ressize += repsize-1;
6236 }
6237 memcpy(str, PyBytes_AsString(repunicode), repsize);
6238 str += repsize;
6239 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006240 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006241 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006242 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006243 /* need more space? (at least enough for what we
6244 have+the replacement+the rest of the string, so
6245 we won't have to check space for encodable characters) */
6246 respos = str - PyBytes_AS_STRING(res);
6247 repsize = PyUnicode_GET_SIZE(repunicode);
6248 requiredsize = respos+repsize+(endp-collend);
6249 if (requiredsize > ressize) {
6250 if (requiredsize<2*ressize)
6251 requiredsize = 2*ressize;
6252 if (_PyBytes_Resize(&res, requiredsize)) {
6253 Py_DECREF(repunicode);
6254 goto onError;
6255 }
6256 str = PyBytes_AS_STRING(res) + respos;
6257 ressize = requiredsize;
6258 }
6259 /* check if there is anything unencodable in the replacement
6260 and copy it to the output */
6261 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
6262 c = *uni2;
6263 if (c >= limit) {
6264 raise_encode_exception(&exc, encoding, startp, size,
6265 unicodepos, unicodepos+1, reason);
6266 Py_DECREF(repunicode);
6267 goto onError;
6268 }
6269 *str = (char)c;
6270 }
6271 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006272 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006273 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006274 }
6275 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006276 /* Resize if we allocated to much */
6277 size = str - PyBytes_AS_STRING(res);
6278 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006279 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006280 if (_PyBytes_Resize(&res, size) < 0)
6281 goto onError;
6282 }
6283
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006284 Py_XDECREF(errorHandler);
6285 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006286 return res;
6287
6288 onError:
6289 Py_XDECREF(res);
6290 Py_XDECREF(errorHandler);
6291 Py_XDECREF(exc);
6292 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006293}
6294
Alexander Belopolsky40018472011-02-26 01:02:56 +00006295PyObject *
6296PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006297 Py_ssize_t size,
6298 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006299{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006300 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006301}
6302
Alexander Belopolsky40018472011-02-26 01:02:56 +00006303PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006304_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006305{
6306 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006307 PyErr_BadArgument();
6308 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006309 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006310 if (PyUnicode_READY(unicode) == -1)
6311 return NULL;
6312 /* Fast path: if it is a one-byte string, construct
6313 bytes object directly. */
6314 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6315 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6316 PyUnicode_GET_LENGTH(unicode));
6317 /* Non-Latin-1 characters present. Defer to above function to
6318 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006319 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006320 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006321 errors);
6322}
6323
6324PyObject*
6325PyUnicode_AsLatin1String(PyObject *unicode)
6326{
6327 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006328}
6329
6330/* --- 7-bit ASCII Codec -------------------------------------------------- */
6331
Alexander Belopolsky40018472011-02-26 01:02:56 +00006332PyObject *
6333PyUnicode_DecodeASCII(const char *s,
6334 Py_ssize_t size,
6335 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006336{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006337 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006338 PyUnicodeObject *v;
6339 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006340 Py_ssize_t startinpos;
6341 Py_ssize_t endinpos;
6342 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006343 const char *e;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006344 unsigned char* d;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006345 PyObject *errorHandler = NULL;
6346 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006347 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00006348
Guido van Rossumd57fd912000-03-10 22:53:23 +00006349 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006350 if (size == 1 && *(unsigned char*)s < 128)
6351 return PyUnicode_FromOrdinal(*(unsigned char*)s);
6352
6353 /* Fast path. Assume the input actually *is* ASCII, and allocate
6354 a single-block Unicode object with that assumption. If there is
6355 an error, drop the object and start over. */
6356 v = (PyUnicodeObject*)PyUnicode_New(size, 127);
6357 if (v == NULL)
6358 goto onError;
6359 d = PyUnicode_1BYTE_DATA(v);
6360 for (i = 0; i < size; i++) {
6361 unsigned char ch = ((unsigned char*)s)[i];
6362 if (ch < 128)
6363 d[i] = ch;
6364 else
6365 break;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006366 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006367 if (i == size)
6368 return (PyObject*)v;
6369 Py_DECREF(v); /* start over */
Tim Petersced69f82003-09-16 20:30:58 +00006370
Guido van Rossumd57fd912000-03-10 22:53:23 +00006371 v = _PyUnicode_New(size);
6372 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006373 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006374 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006375 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006376 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006377 e = s + size;
6378 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006379 register unsigned char c = (unsigned char)*s;
6380 if (c < 128) {
6381 *p++ = c;
6382 ++s;
6383 }
6384 else {
6385 startinpos = s-starts;
6386 endinpos = startinpos + 1;
6387 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
6388 if (unicode_decode_call_errorhandler(
6389 errors, &errorHandler,
6390 "ascii", "ordinal not in range(128)",
6391 &starts, &e, &startinpos, &endinpos, &exc, &s,
6392 &v, &outpos, &p))
6393 goto onError;
6394 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006395 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00006396 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinnerfe226c02011-10-03 03:52:20 +02006397 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006398 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006399 Py_XDECREF(errorHandler);
6400 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006401 if (PyUnicode_READY(v) == -1) {
6402 Py_DECREF(v);
6403 return NULL;
6404 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006405 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006406
Benjamin Peterson29060642009-01-31 22:14:21 +00006407 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006408 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006409 Py_XDECREF(errorHandler);
6410 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006411 return NULL;
6412}
6413
Alexander Belopolsky40018472011-02-26 01:02:56 +00006414PyObject *
6415PyUnicode_EncodeASCII(const Py_UNICODE *p,
6416 Py_ssize_t size,
6417 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006418{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006419 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006420}
6421
Alexander Belopolsky40018472011-02-26 01:02:56 +00006422PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006423_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006424{
6425 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006426 PyErr_BadArgument();
6427 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006428 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006429 if (PyUnicode_READY(unicode) == -1)
6430 return NULL;
6431 /* Fast path: if it is an ASCII-only string, construct bytes object
6432 directly. Else defer to above function to raise the exception. */
6433 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6434 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6435 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006436 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006437 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006438 errors);
6439}
6440
6441PyObject *
6442PyUnicode_AsASCIIString(PyObject *unicode)
6443{
6444 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006445}
6446
Victor Stinner99b95382011-07-04 14:23:54 +02006447#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006448
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006449/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006450
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006451#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006452#define NEED_RETRY
6453#endif
6454
6455/* XXX This code is limited to "true" double-byte encodings, as
6456 a) it assumes an incomplete character consists of a single byte, and
6457 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00006458 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006459
/* Return 1 if the byte at s[offset] is to be treated as a DBCS lead byte,
   0 otherwise.

   A byte that IsDBCSLeadByte() accepts only counts when looking at the
   character CharPrev() finds before it gives one of these results: there
   is no previous character, the previous character does not start with a
   lead byte, or the previous character is exactly two bytes long. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
6471
6472/*
6473 * Decode MBCS string into unicode object. If 'final' is set, converts
6474 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6475 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006476static int
6477decode_mbcs(PyUnicodeObject **v,
6478 const char *s, /* MBCS string */
6479 int size, /* sizeof MBCS string */
6480 int final,
6481 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006482{
6483 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00006484 Py_ssize_t n;
6485 DWORD usize;
6486 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006487
6488 assert(size >= 0);
6489
Victor Stinner554f3f02010-06-16 23:33:54 +00006490 /* check and handle 'errors' arg */
6491 if (errors==NULL || strcmp(errors, "strict")==0)
6492 flags = MB_ERR_INVALID_CHARS;
6493 else if (strcmp(errors, "ignore")==0)
6494 flags = 0;
6495 else {
6496 PyErr_Format(PyExc_ValueError,
6497 "mbcs encoding does not support errors='%s'",
6498 errors);
6499 return -1;
6500 }
6501
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006502 /* Skip trailing lead-byte unless 'final' is set */
6503 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006504 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006505
6506 /* First get the size of the result */
6507 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006508 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6509 if (usize==0)
6510 goto mbcs_decode_error;
6511 } else
6512 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006513
6514 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006515 /* Create unicode object */
6516 *v = _PyUnicode_New(usize);
6517 if (*v == NULL)
6518 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006519 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006520 }
6521 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006522 /* Extend unicode object */
6523 n = PyUnicode_GET_SIZE(*v);
Victor Stinnerfe226c02011-10-03 03:52:20 +02006524 if (PyUnicode_Resize(v, n + usize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006525 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006526 }
6527
6528 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00006529 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006530 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006531 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6532 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00006533 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006534 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006535 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00006536
6537mbcs_decode_error:
6538 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6539 we raise a UnicodeDecodeError - else it is a 'generic'
6540 windows error
6541 */
6542 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6543 /* Ideally, we should get reason from FormatMessage - this
6544 is the Windows 2000 English version of the message
6545 */
6546 PyObject *exc = NULL;
6547 const char *reason = "No mapping for the Unicode character exists "
6548 "in the target multi-byte code page.";
6549 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6550 if (exc != NULL) {
6551 PyCodec_StrictErrors(exc);
6552 Py_DECREF(exc);
6553 }
6554 } else {
6555 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6556 }
6557 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006558}
6559
Alexander Belopolsky40018472011-02-26 01:02:56 +00006560PyObject *
6561PyUnicode_DecodeMBCSStateful(const char *s,
6562 Py_ssize_t size,
6563 const char *errors,
6564 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006565{
6566 PyUnicodeObject *v = NULL;
6567 int done;
6568
6569 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006570 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006571
6572#ifdef NEED_RETRY
6573 retry:
6574 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006575 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006576 else
6577#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006578 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006579
6580 if (done < 0) {
6581 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006582 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006583 }
6584
6585 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006586 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006587
6588#ifdef NEED_RETRY
6589 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006590 s += done;
6591 size -= done;
6592 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006593 }
6594#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006595 if (PyUnicode_READY(v) == -1) {
6596 Py_DECREF(v);
6597 return NULL;
6598 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006599 return (PyObject *)v;
6600}
6601
Alexander Belopolsky40018472011-02-26 01:02:56 +00006602PyObject *
6603PyUnicode_DecodeMBCS(const char *s,
6604 Py_ssize_t size,
6605 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006606{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006607 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6608}
6609
6610/*
6611 * Convert unicode into string object (MBCS).
6612 * Returns 0 if succeed, -1 otherwise.
6613 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006614static int
6615encode_mbcs(PyObject **repr,
6616 const Py_UNICODE *p, /* unicode */
6617 int size, /* size of unicode */
6618 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006619{
Victor Stinner554f3f02010-06-16 23:33:54 +00006620 BOOL usedDefaultChar = FALSE;
6621 BOOL *pusedDefaultChar;
6622 int mbcssize;
6623 Py_ssize_t n;
6624 PyObject *exc = NULL;
6625 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006626
6627 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006628
Victor Stinner554f3f02010-06-16 23:33:54 +00006629 /* check and handle 'errors' arg */
6630 if (errors==NULL || strcmp(errors, "strict")==0) {
6631 flags = WC_NO_BEST_FIT_CHARS;
6632 pusedDefaultChar = &usedDefaultChar;
6633 } else if (strcmp(errors, "replace")==0) {
6634 flags = 0;
6635 pusedDefaultChar = NULL;
6636 } else {
6637 PyErr_Format(PyExc_ValueError,
6638 "mbcs encoding does not support errors='%s'",
6639 errors);
6640 return -1;
6641 }
6642
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006643 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006644 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006645 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
6646 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00006647 if (mbcssize == 0) {
6648 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6649 return -1;
6650 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006651 /* If we used a default char, then we failed! */
6652 if (pusedDefaultChar && *pusedDefaultChar)
6653 goto mbcs_encode_error;
6654 } else {
6655 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006656 }
6657
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006658 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006659 /* Create string object */
6660 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
6661 if (*repr == NULL)
6662 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006663 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006664 }
6665 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006666 /* Extend string object */
6667 n = PyBytes_Size(*repr);
6668 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
6669 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006670 }
6671
6672 /* Do the conversion */
6673 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006674 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006675 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
6676 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006677 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6678 return -1;
6679 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006680 if (pusedDefaultChar && *pusedDefaultChar)
6681 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006682 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006683 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006684
6685mbcs_encode_error:
6686 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
6687 Py_XDECREF(exc);
6688 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006689}
6690
Alexander Belopolsky40018472011-02-26 01:02:56 +00006691PyObject *
6692PyUnicode_EncodeMBCS(const Py_UNICODE *p,
6693 Py_ssize_t size,
6694 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006695{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006696 PyObject *repr = NULL;
6697 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00006698
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006699#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00006700 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006701 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006702 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006703 else
6704#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006705 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006706
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006707 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006708 Py_XDECREF(repr);
6709 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006710 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006711
6712#ifdef NEED_RETRY
6713 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006714 p += INT_MAX;
6715 size -= INT_MAX;
6716 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006717 }
6718#endif
6719
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006720 return repr;
6721}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006722
Alexander Belopolsky40018472011-02-26 01:02:56 +00006723PyObject *
6724PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006725{
6726 if (!PyUnicode_Check(unicode)) {
6727 PyErr_BadArgument();
6728 return NULL;
6729 }
6730 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006731 PyUnicode_GET_SIZE(unicode),
6732 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006733}
6734
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006735#undef NEED_RETRY
6736
Victor Stinner99b95382011-07-04 14:23:54 +02006737#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006738
Guido van Rossumd57fd912000-03-10 22:53:23 +00006739/* --- Character Mapping Codec -------------------------------------------- */
6740
Alexander Belopolsky40018472011-02-26 01:02:56 +00006741PyObject *
6742PyUnicode_DecodeCharmap(const char *s,
6743 Py_ssize_t size,
6744 PyObject *mapping,
6745 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006746{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006747 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006748 Py_ssize_t startinpos;
6749 Py_ssize_t endinpos;
6750 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006751 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006752 PyUnicodeObject *v;
6753 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006754 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006755 PyObject *errorHandler = NULL;
6756 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006757 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006758 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006759
Guido van Rossumd57fd912000-03-10 22:53:23 +00006760 /* Default to Latin-1 */
6761 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006762 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006763
6764 v = _PyUnicode_New(size);
6765 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006766 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006767 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006768 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006769 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006770 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006771 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006772 mapstring = PyUnicode_AS_UNICODE(mapping);
6773 maplen = PyUnicode_GET_SIZE(mapping);
6774 while (s < e) {
6775 unsigned char ch = *s;
6776 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006777
Benjamin Peterson29060642009-01-31 22:14:21 +00006778 if (ch < maplen)
6779 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006780
Benjamin Peterson29060642009-01-31 22:14:21 +00006781 if (x == 0xfffe) {
6782 /* undefined mapping */
6783 outpos = p-PyUnicode_AS_UNICODE(v);
6784 startinpos = s-starts;
6785 endinpos = startinpos+1;
6786 if (unicode_decode_call_errorhandler(
6787 errors, &errorHandler,
6788 "charmap", "character maps to <undefined>",
6789 &starts, &e, &startinpos, &endinpos, &exc, &s,
6790 &v, &outpos, &p)) {
6791 goto onError;
6792 }
6793 continue;
6794 }
6795 *p++ = x;
6796 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006797 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006798 }
6799 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006800 while (s < e) {
6801 unsigned char ch = *s;
6802 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006803
Benjamin Peterson29060642009-01-31 22:14:21 +00006804 /* Get mapping (char ordinal -> integer, Unicode char or None) */
6805 w = PyLong_FromLong((long)ch);
6806 if (w == NULL)
6807 goto onError;
6808 x = PyObject_GetItem(mapping, w);
6809 Py_DECREF(w);
6810 if (x == NULL) {
6811 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6812 /* No mapping found means: mapping is undefined. */
6813 PyErr_Clear();
6814 x = Py_None;
6815 Py_INCREF(x);
6816 } else
6817 goto onError;
6818 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006819
Benjamin Peterson29060642009-01-31 22:14:21 +00006820 /* Apply mapping */
6821 if (PyLong_Check(x)) {
6822 long value = PyLong_AS_LONG(x);
6823 if (value < 0 || value > 65535) {
6824 PyErr_SetString(PyExc_TypeError,
6825 "character mapping must be in range(65536)");
6826 Py_DECREF(x);
6827 goto onError;
6828 }
6829 *p++ = (Py_UNICODE)value;
6830 }
6831 else if (x == Py_None) {
6832 /* undefined mapping */
6833 outpos = p-PyUnicode_AS_UNICODE(v);
6834 startinpos = s-starts;
6835 endinpos = startinpos+1;
6836 if (unicode_decode_call_errorhandler(
6837 errors, &errorHandler,
6838 "charmap", "character maps to <undefined>",
6839 &starts, &e, &startinpos, &endinpos, &exc, &s,
6840 &v, &outpos, &p)) {
6841 Py_DECREF(x);
6842 goto onError;
6843 }
6844 Py_DECREF(x);
6845 continue;
6846 }
6847 else if (PyUnicode_Check(x)) {
6848 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006849
Benjamin Peterson29060642009-01-31 22:14:21 +00006850 if (targetsize == 1)
6851 /* 1-1 mapping */
6852 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006853
Benjamin Peterson29060642009-01-31 22:14:21 +00006854 else if (targetsize > 1) {
6855 /* 1-n mapping */
6856 if (targetsize > extrachars) {
6857 /* resize first */
6858 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
6859 Py_ssize_t needed = (targetsize - extrachars) + \
6860 (targetsize << 2);
6861 extrachars += needed;
6862 /* XXX overflow detection missing */
Victor Stinnerfe226c02011-10-03 03:52:20 +02006863 if (PyUnicode_Resize((PyObject**)&v,
Benjamin Peterson29060642009-01-31 22:14:21 +00006864 PyUnicode_GET_SIZE(v) + needed) < 0) {
6865 Py_DECREF(x);
6866 goto onError;
6867 }
6868 p = PyUnicode_AS_UNICODE(v) + oldpos;
6869 }
6870 Py_UNICODE_COPY(p,
6871 PyUnicode_AS_UNICODE(x),
6872 targetsize);
6873 p += targetsize;
6874 extrachars -= targetsize;
6875 }
6876 /* 1-0 mapping: skip the character */
6877 }
6878 else {
6879 /* wrong return value */
6880 PyErr_SetString(PyExc_TypeError,
6881 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006882 Py_DECREF(x);
6883 goto onError;
6884 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006885 Py_DECREF(x);
6886 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006887 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006888 }
6889 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinnerfe226c02011-10-03 03:52:20 +02006890 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006891 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006892 Py_XDECREF(errorHandler);
6893 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006894 if (PyUnicode_READY(v) == -1) {
6895 Py_DECREF(v);
6896 return NULL;
6897 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006898 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006899
Benjamin Peterson29060642009-01-31 22:14:21 +00006900 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006901 Py_XDECREF(errorHandler);
6902 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006903 Py_XDECREF(v);
6904 return NULL;
6905}
6906
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006907/* Charmap encoding: the lookup table */
6908
Alexander Belopolsky40018472011-02-26 01:02:56 +00006909struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00006910 PyObject_HEAD
6911 unsigned char level1[32];
6912 int count2, count3;
6913 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006914};
6915
6916static PyObject*
6917encoding_map_size(PyObject *obj, PyObject* args)
6918{
6919 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006920 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00006921 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006922}
6923
6924static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006925 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00006926 PyDoc_STR("Return the size (in bytes) of this object") },
6927 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006928};
6929
6930static void
6931encoding_map_dealloc(PyObject* o)
6932{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006933 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006934}
6935
6936static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006937 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006938 "EncodingMap", /*tp_name*/
6939 sizeof(struct encoding_map), /*tp_basicsize*/
6940 0, /*tp_itemsize*/
6941 /* methods */
6942 encoding_map_dealloc, /*tp_dealloc*/
6943 0, /*tp_print*/
6944 0, /*tp_getattr*/
6945 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00006946 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00006947 0, /*tp_repr*/
6948 0, /*tp_as_number*/
6949 0, /*tp_as_sequence*/
6950 0, /*tp_as_mapping*/
6951 0, /*tp_hash*/
6952 0, /*tp_call*/
6953 0, /*tp_str*/
6954 0, /*tp_getattro*/
6955 0, /*tp_setattro*/
6956 0, /*tp_as_buffer*/
6957 Py_TPFLAGS_DEFAULT, /*tp_flags*/
6958 0, /*tp_doc*/
6959 0, /*tp_traverse*/
6960 0, /*tp_clear*/
6961 0, /*tp_richcompare*/
6962 0, /*tp_weaklistoffset*/
6963 0, /*tp_iter*/
6964 0, /*tp_iternext*/
6965 encoding_map_methods, /*tp_methods*/
6966 0, /*tp_members*/
6967 0, /*tp_getset*/
6968 0, /*tp_base*/
6969 0, /*tp_dict*/
6970 0, /*tp_descr_get*/
6971 0, /*tp_descr_set*/
6972 0, /*tp_dictoffset*/
6973 0, /*tp_init*/
6974 0, /*tp_alloc*/
6975 0, /*tp_new*/
6976 0, /*tp_free*/
6977 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006978};
6979
6980PyObject*
6981PyUnicode_BuildEncodingMap(PyObject* string)
6982{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006983 PyObject *result;
6984 struct encoding_map *mresult;
6985 int i;
6986 int need_dict = 0;
6987 unsigned char level1[32];
6988 unsigned char level2[512];
6989 unsigned char *mlevel1, *mlevel2, *mlevel3;
6990 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006991 int kind;
6992 void *data;
6993 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006994
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006995 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006996 PyErr_BadArgument();
6997 return NULL;
6998 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006999 kind = PyUnicode_KIND(string);
7000 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007001 memset(level1, 0xFF, sizeof level1);
7002 memset(level2, 0xFF, sizeof level2);
7003
7004 /* If there isn't a one-to-one mapping of NULL to \0,
7005 or if there are non-BMP characters, we need to use
7006 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007007 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007008 need_dict = 1;
7009 for (i = 1; i < 256; i++) {
7010 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007011 ch = PyUnicode_READ(kind, data, i);
7012 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007013 need_dict = 1;
7014 break;
7015 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007016 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007017 /* unmapped character */
7018 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007019 l1 = ch >> 11;
7020 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007021 if (level1[l1] == 0xFF)
7022 level1[l1] = count2++;
7023 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007024 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007025 }
7026
7027 if (count2 >= 0xFF || count3 >= 0xFF)
7028 need_dict = 1;
7029
7030 if (need_dict) {
7031 PyObject *result = PyDict_New();
7032 PyObject *key, *value;
7033 if (!result)
7034 return NULL;
7035 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007036 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007037 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007038 if (!key || !value)
7039 goto failed1;
7040 if (PyDict_SetItem(result, key, value) == -1)
7041 goto failed1;
7042 Py_DECREF(key);
7043 Py_DECREF(value);
7044 }
7045 return result;
7046 failed1:
7047 Py_XDECREF(key);
7048 Py_XDECREF(value);
7049 Py_DECREF(result);
7050 return NULL;
7051 }
7052
7053 /* Create a three-level trie */
7054 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7055 16*count2 + 128*count3 - 1);
7056 if (!result)
7057 return PyErr_NoMemory();
7058 PyObject_Init(result, &EncodingMapType);
7059 mresult = (struct encoding_map*)result;
7060 mresult->count2 = count2;
7061 mresult->count3 = count3;
7062 mlevel1 = mresult->level1;
7063 mlevel2 = mresult->level23;
7064 mlevel3 = mresult->level23 + 16*count2;
7065 memcpy(mlevel1, level1, 32);
7066 memset(mlevel2, 0xFF, 16*count2);
7067 memset(mlevel3, 0, 128*count3);
7068 count3 = 0;
7069 for (i = 1; i < 256; i++) {
7070 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007071 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007072 /* unmapped character */
7073 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007074 o1 = PyUnicode_READ(kind, data, i)>>11;
7075 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007076 i2 = 16*mlevel1[o1] + o2;
7077 if (mlevel2[i2] == 0xFF)
7078 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007079 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007080 i3 = 128*mlevel2[i2] + o3;
7081 mlevel3[i3] = i;
7082 }
7083 return result;
7084}
7085
7086static int
7087encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
7088{
7089 struct encoding_map *map = (struct encoding_map*)mapping;
7090 int l1 = c>>11;
7091 int l2 = (c>>7) & 0xF;
7092 int l3 = c & 0x7F;
7093 int i;
7094
7095#ifdef Py_UNICODE_WIDE
7096 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007097 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007098 }
7099#endif
7100 if (c == 0)
7101 return 0;
7102 /* level 1*/
7103 i = map->level1[l1];
7104 if (i == 0xFF) {
7105 return -1;
7106 }
7107 /* level 2*/
7108 i = map->level23[16*i+l2];
7109 if (i == 0xFF) {
7110 return -1;
7111 }
7112 /* level 3 */
7113 i = map->level23[16*map->count2 + 128*i + l3];
7114 if (i == 0) {
7115 return -1;
7116 }
7117 return i;
7118}
7119
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007120/* Lookup the character ch in the mapping. If the character
7121 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007122 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007123static PyObject *
7124charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007125{
Christian Heimes217cfd12007-12-02 14:31:20 +00007126 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007127 PyObject *x;
7128
7129 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007130 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007131 x = PyObject_GetItem(mapping, w);
7132 Py_DECREF(w);
7133 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007134 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7135 /* No mapping found means: mapping is undefined. */
7136 PyErr_Clear();
7137 x = Py_None;
7138 Py_INCREF(x);
7139 return x;
7140 } else
7141 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007142 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007143 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007144 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007145 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007146 long value = PyLong_AS_LONG(x);
7147 if (value < 0 || value > 255) {
7148 PyErr_SetString(PyExc_TypeError,
7149 "character mapping must be in range(256)");
7150 Py_DECREF(x);
7151 return NULL;
7152 }
7153 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007154 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007155 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007156 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007157 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007158 /* wrong return value */
7159 PyErr_Format(PyExc_TypeError,
7160 "character mapping must return integer, bytes or None, not %.400s",
7161 x->ob_type->tp_name);
7162 Py_DECREF(x);
7163 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007164 }
7165}
7166
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007167static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007168charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007169{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007170 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7171 /* exponentially overallocate to minimize reallocations */
7172 if (requiredsize < 2*outsize)
7173 requiredsize = 2*outsize;
7174 if (_PyBytes_Resize(outobj, requiredsize))
7175 return -1;
7176 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007177}
7178
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007182/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007183 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007184 space is available. Return a new reference to the object that
7185 was put in the output buffer, or Py_None, if the mapping was undefined
7186 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007187 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007188static charmapencode_result
7189charmapencode_output(Py_UNICODE c, PyObject *mapping,
7190 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007191{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007192 PyObject *rep;
7193 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007194 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007195
Christian Heimes90aa7642007-12-19 02:45:37 +00007196 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007197 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007198 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007199 if (res == -1)
7200 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007201 if (outsize<requiredsize)
7202 if (charmapencode_resize(outobj, outpos, requiredsize))
7203 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007204 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007205 outstart[(*outpos)++] = (char)res;
7206 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007207 }
7208
7209 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007210 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007211 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007212 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007213 Py_DECREF(rep);
7214 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007215 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007216 if (PyLong_Check(rep)) {
7217 Py_ssize_t requiredsize = *outpos+1;
7218 if (outsize<requiredsize)
7219 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7220 Py_DECREF(rep);
7221 return enc_EXCEPTION;
7222 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007223 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007224 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007225 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007226 else {
7227 const char *repchars = PyBytes_AS_STRING(rep);
7228 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7229 Py_ssize_t requiredsize = *outpos+repsize;
7230 if (outsize<requiredsize)
7231 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7232 Py_DECREF(rep);
7233 return enc_EXCEPTION;
7234 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007235 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007236 memcpy(outstart + *outpos, repchars, repsize);
7237 *outpos += repsize;
7238 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007239 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007240 Py_DECREF(rep);
7241 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007242}
7243
7244/* handle an error in PyUnicode_EncodeCharmap
7245 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007246static int
7247charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00007248 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007249 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007250 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007251 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007252{
7253 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007254 Py_ssize_t repsize;
7255 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007256 Py_UNICODE *uni2;
7257 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007258 Py_ssize_t collstartpos = *inpos;
7259 Py_ssize_t collendpos = *inpos+1;
7260 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007261 char *encoding = "charmap";
7262 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007263 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007264
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007265 /* find all unencodable characters */
7266 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007267 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007268 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007269 int res = encoding_map_lookup(p[collendpos], mapping);
7270 if (res != -1)
7271 break;
7272 ++collendpos;
7273 continue;
7274 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007275
Benjamin Peterson29060642009-01-31 22:14:21 +00007276 rep = charmapencode_lookup(p[collendpos], mapping);
7277 if (rep==NULL)
7278 return -1;
7279 else if (rep!=Py_None) {
7280 Py_DECREF(rep);
7281 break;
7282 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007283 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007284 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007285 }
7286 /* cache callback name lookup
7287 * (if not done yet, i.e. it's the first error) */
7288 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007289 if ((errors==NULL) || (!strcmp(errors, "strict")))
7290 *known_errorHandler = 1;
7291 else if (!strcmp(errors, "replace"))
7292 *known_errorHandler = 2;
7293 else if (!strcmp(errors, "ignore"))
7294 *known_errorHandler = 3;
7295 else if (!strcmp(errors, "xmlcharrefreplace"))
7296 *known_errorHandler = 4;
7297 else
7298 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007299 }
7300 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007301 case 1: /* strict */
7302 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7303 return -1;
7304 case 2: /* replace */
7305 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007306 x = charmapencode_output('?', mapping, res, respos);
7307 if (x==enc_EXCEPTION) {
7308 return -1;
7309 }
7310 else if (x==enc_FAILED) {
7311 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7312 return -1;
7313 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007314 }
7315 /* fall through */
7316 case 3: /* ignore */
7317 *inpos = collendpos;
7318 break;
7319 case 4: /* xmlcharrefreplace */
7320 /* generate replacement (temporarily (mis)uses p) */
7321 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007322 char buffer[2+29+1+1];
7323 char *cp;
7324 sprintf(buffer, "&#%d;", (int)p[collpos]);
7325 for (cp = buffer; *cp; ++cp) {
7326 x = charmapencode_output(*cp, mapping, res, respos);
7327 if (x==enc_EXCEPTION)
7328 return -1;
7329 else if (x==enc_FAILED) {
7330 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7331 return -1;
7332 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007333 }
7334 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007335 *inpos = collendpos;
7336 break;
7337 default:
7338 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00007339 encoding, reason, p, size, exceptionObject,
7340 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007341 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007342 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007343 if (PyBytes_Check(repunicode)) {
7344 /* Directly copy bytes result to output. */
7345 Py_ssize_t outsize = PyBytes_Size(*res);
7346 Py_ssize_t requiredsize;
7347 repsize = PyBytes_Size(repunicode);
7348 requiredsize = *respos + repsize;
7349 if (requiredsize > outsize)
7350 /* Make room for all additional bytes. */
7351 if (charmapencode_resize(res, respos, requiredsize)) {
7352 Py_DECREF(repunicode);
7353 return -1;
7354 }
7355 memcpy(PyBytes_AsString(*res) + *respos,
7356 PyBytes_AsString(repunicode), repsize);
7357 *respos += repsize;
7358 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007359 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007360 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007361 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007362 /* generate replacement */
7363 repsize = PyUnicode_GET_SIZE(repunicode);
7364 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007365 x = charmapencode_output(*uni2, mapping, res, respos);
7366 if (x==enc_EXCEPTION) {
7367 return -1;
7368 }
7369 else if (x==enc_FAILED) {
7370 Py_DECREF(repunicode);
7371 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7372 return -1;
7373 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007374 }
7375 *inpos = newpos;
7376 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007377 }
7378 return 0;
7379}
7380
Alexander Belopolsky40018472011-02-26 01:02:56 +00007381PyObject *
7382PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7383 Py_ssize_t size,
7384 PyObject *mapping,
7385 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007386{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007387 /* output object */
7388 PyObject *res = NULL;
7389 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007390 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007391 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007392 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007393 PyObject *errorHandler = NULL;
7394 PyObject *exc = NULL;
7395 /* the following variable is used for caching string comparisons
7396 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7397 * 3=ignore, 4=xmlcharrefreplace */
7398 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007399
7400 /* Default to Latin-1 */
7401 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007402 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007403
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007404 /* allocate enough for a simple encoding without
7405 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007406 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007407 if (res == NULL)
7408 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007409 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007410 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007411
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007412 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007413 /* try to encode it */
7414 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7415 if (x==enc_EXCEPTION) /* error */
7416 goto onError;
7417 if (x==enc_FAILED) { /* unencodable character */
7418 if (charmap_encoding_error(p, size, &inpos, mapping,
7419 &exc,
7420 &known_errorHandler, &errorHandler, errors,
7421 &res, &respos)) {
7422 goto onError;
7423 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007424 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007425 else
7426 /* done with this character => adjust input position */
7427 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007428 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007429
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007430 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007431 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007432 if (_PyBytes_Resize(&res, respos) < 0)
7433 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00007434
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007435 Py_XDECREF(exc);
7436 Py_XDECREF(errorHandler);
7437 return res;
7438
Benjamin Peterson29060642009-01-31 22:14:21 +00007439 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007440 Py_XDECREF(res);
7441 Py_XDECREF(exc);
7442 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007443 return NULL;
7444}
7445
Alexander Belopolsky40018472011-02-26 01:02:56 +00007446PyObject *
7447PyUnicode_AsCharmapString(PyObject *unicode,
7448 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007449{
7450 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007451 PyErr_BadArgument();
7452 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007453 }
7454 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007455 PyUnicode_GET_SIZE(unicode),
7456 mapping,
7457 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007458}
7459
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007460/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007461static void
7462make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007463 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007464 Py_ssize_t startpos, Py_ssize_t endpos,
7465 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007466{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007467 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007468 *exceptionObject = _PyUnicodeTranslateError_Create(
7469 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007470 }
7471 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007472 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7473 goto onError;
7474 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7475 goto onError;
7476 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7477 goto onError;
7478 return;
7479 onError:
7480 Py_DECREF(*exceptionObject);
7481 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007482 }
7483}
7484
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007485/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007486static void
7487raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007488 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007489 Py_ssize_t startpos, Py_ssize_t endpos,
7490 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007491{
7492 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007493 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007494 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007495 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007496}
7497
7498/* error handling callback helper:
7499 build arguments, call the callback and check the arguments,
7500 put the result into newpos and return the replacement string, which
7501 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007502static PyObject *
7503unicode_translate_call_errorhandler(const char *errors,
7504 PyObject **errorHandler,
7505 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007506 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007507 Py_ssize_t startpos, Py_ssize_t endpos,
7508 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007509{
Benjamin Peterson142957c2008-07-04 19:55:29 +00007510 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007511
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007512 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007513 PyObject *restuple;
7514 PyObject *resunicode;
7515
7516 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007517 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007518 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007519 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007520 }
7521
7522 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007523 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007524 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007525 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007526
7527 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00007528 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007529 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007530 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007531 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00007532 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007533 Py_DECREF(restuple);
7534 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007535 }
7536 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00007537 &resunicode, &i_newpos)) {
7538 Py_DECREF(restuple);
7539 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007540 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007541 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007542 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007543 else
7544 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007545 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007546 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7547 Py_DECREF(restuple);
7548 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007549 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007550 Py_INCREF(resunicode);
7551 Py_DECREF(restuple);
7552 return resunicode;
7553}
7554
7555/* Lookup the character ch in the mapping and put the result in result,
7556 which must be decrefed by the caller.
7557 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007558static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007559charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007560{
Christian Heimes217cfd12007-12-02 14:31:20 +00007561 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007562 PyObject *x;
7563
7564 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007565 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007566 x = PyObject_GetItem(mapping, w);
7567 Py_DECREF(w);
7568 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007569 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7570 /* No mapping found means: use 1:1 mapping. */
7571 PyErr_Clear();
7572 *result = NULL;
7573 return 0;
7574 } else
7575 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007576 }
7577 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007578 *result = x;
7579 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007580 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007581 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007582 long value = PyLong_AS_LONG(x);
7583 long max = PyUnicode_GetMax();
7584 if (value < 0 || value > max) {
7585 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00007586 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007587 Py_DECREF(x);
7588 return -1;
7589 }
7590 *result = x;
7591 return 0;
7592 }
7593 else if (PyUnicode_Check(x)) {
7594 *result = x;
7595 return 0;
7596 }
7597 else {
7598 /* wrong return value */
7599 PyErr_SetString(PyExc_TypeError,
7600 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007601 Py_DECREF(x);
7602 return -1;
7603 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007604}
7605/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00007606 if not reallocate and adjust various state variables.
7607 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007608static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007609charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00007610 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007611{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007612 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00007613 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007614 /* exponentially overallocate to minimize reallocations */
7615 if (requiredsize < 2 * oldsize)
7616 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007617 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
7618 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007619 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007620 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007621 }
7622 return 0;
7623}
7624/* lookup the character, put the result in the output string and adjust
7625 various state variables. Return a new reference to the object that
7626 was put in the output buffer in *result, or Py_None, if the mapping was
7627 undefined (in which case no character was written).
7628 The called must decref result.
7629 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007630static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007631charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
7632 PyObject *mapping, Py_UCS4 **output,
7633 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007634 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007635{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007636 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
7637 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00007638 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007639 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007640 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007641 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007642 }
7643 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007644 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00007645 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007646 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007647 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007648 }
7649 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007650 Py_ssize_t repsize;
7651 if (PyUnicode_READY(*res) == -1)
7652 return -1;
7653 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00007654 if (repsize==1) {
7655 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007656 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00007657 }
7658 else if (repsize!=0) {
7659 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007660 Py_ssize_t requiredsize = *opos +
7661 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00007662 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007663 Py_ssize_t i;
7664 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007665 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007666 for(i = 0; i < repsize; i++)
7667 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00007668 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007669 }
7670 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007671 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007672 return 0;
7673}
7674
Alexander Belopolsky40018472011-02-26 01:02:56 +00007675PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007676_PyUnicode_TranslateCharmap(PyObject *input,
7677 PyObject *mapping,
7678 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007679{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007680 /* input object */
7681 char *idata;
7682 Py_ssize_t size, i;
7683 int kind;
7684 /* output buffer */
7685 Py_UCS4 *output = NULL;
7686 Py_ssize_t osize;
7687 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007688 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007689 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007690 char *reason = "character maps to <undefined>";
7691 PyObject *errorHandler = NULL;
7692 PyObject *exc = NULL;
7693 /* the following variable is used for caching string comparisons
7694 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7695 * 3=ignore, 4=xmlcharrefreplace */
7696 int known_errorHandler = -1;
7697
Guido van Rossumd57fd912000-03-10 22:53:23 +00007698 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007699 PyErr_BadArgument();
7700 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007701 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007702
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007703 if (PyUnicode_READY(input) == -1)
7704 return NULL;
7705 idata = (char*)PyUnicode_DATA(input);
7706 kind = PyUnicode_KIND(input);
7707 size = PyUnicode_GET_LENGTH(input);
7708 i = 0;
7709
7710 if (size == 0) {
7711 Py_INCREF(input);
7712 return input;
7713 }
7714
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007715 /* allocate enough for a simple 1:1 translation without
7716 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007717 osize = size;
7718 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
7719 opos = 0;
7720 if (output == NULL) {
7721 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00007722 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007723 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007724
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007725 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007726 /* try to encode it */
7727 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007728 if (charmaptranslate_output(input, i, mapping,
7729 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007730 Py_XDECREF(x);
7731 goto onError;
7732 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007733 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00007734 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007735 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007736 else { /* untranslatable character */
7737 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7738 Py_ssize_t repsize;
7739 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007740 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00007741 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007742 Py_ssize_t collstart = i;
7743 Py_ssize_t collend = i+1;
7744 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007745
Benjamin Peterson29060642009-01-31 22:14:21 +00007746 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007747 while (collend < size) {
7748 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007749 goto onError;
7750 Py_XDECREF(x);
7751 if (x!=Py_None)
7752 break;
7753 ++collend;
7754 }
7755 /* cache callback name lookup
7756 * (if not done yet, i.e. it's the first error) */
7757 if (known_errorHandler==-1) {
7758 if ((errors==NULL) || (!strcmp(errors, "strict")))
7759 known_errorHandler = 1;
7760 else if (!strcmp(errors, "replace"))
7761 known_errorHandler = 2;
7762 else if (!strcmp(errors, "ignore"))
7763 known_errorHandler = 3;
7764 else if (!strcmp(errors, "xmlcharrefreplace"))
7765 known_errorHandler = 4;
7766 else
7767 known_errorHandler = 0;
7768 }
7769 switch (known_errorHandler) {
7770 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007771 raise_translate_exception(&exc, input, collstart,
7772 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007773 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007774 case 2: /* replace */
7775 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007776 for (coll = collstart; coll<collend; coll++)
7777 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00007778 /* fall through */
7779 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007780 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007781 break;
7782 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007783 /* generate replacement (temporarily (mis)uses i) */
7784 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007785 char buffer[2+29+1+1];
7786 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007787 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
7788 if (charmaptranslate_makespace(&output, &osize,
7789 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00007790 goto onError;
7791 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007792 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00007793 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007794 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007795 break;
7796 default:
7797 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007798 reason, input, &exc,
7799 collstart, collend, &newpos);
7800 if (repunicode == NULL || PyUnicode_READY(repunicode) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007801 goto onError;
7802 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007803 repsize = PyUnicode_GET_LENGTH(repunicode);
7804 if (charmaptranslate_makespace(&output, &osize,
7805 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007806 Py_DECREF(repunicode);
7807 goto onError;
7808 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007809 for (uni2 = 0; repsize-->0; ++uni2)
7810 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
7811 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00007812 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007813 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007814 }
7815 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007816 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
7817 if (!res)
7818 goto onError;
7819 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007820 Py_XDECREF(exc);
7821 Py_XDECREF(errorHandler);
7822 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007823
Benjamin Peterson29060642009-01-31 22:14:21 +00007824 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007825 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007826 Py_XDECREF(exc);
7827 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007828 return NULL;
7829}
7830
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007831/* Deprecated. Use PyUnicode_Translate instead. */
7832PyObject *
7833PyUnicode_TranslateCharmap(const Py_UNICODE *p,
7834 Py_ssize_t size,
7835 PyObject *mapping,
7836 const char *errors)
7837{
7838 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7839 if (!unicode)
7840 return NULL;
7841 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
7842}
7843
Alexander Belopolsky40018472011-02-26 01:02:56 +00007844PyObject *
7845PyUnicode_Translate(PyObject *str,
7846 PyObject *mapping,
7847 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007848{
7849 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007850
Guido van Rossumd57fd912000-03-10 22:53:23 +00007851 str = PyUnicode_FromObject(str);
7852 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007853 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007854 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007855 Py_DECREF(str);
7856 return result;
Tim Petersced69f82003-09-16 20:30:58 +00007857
Benjamin Peterson29060642009-01-31 22:14:21 +00007858 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007859 Py_XDECREF(str);
7860 return NULL;
7861}
Tim Petersced69f82003-09-16 20:30:58 +00007862
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007863static Py_UCS4
7864fix_decimal_and_space_to_ascii(PyUnicodeObject *self)
7865{
7866 /* No need to call PyUnicode_READY(self) because this function is only
7867 called as a callback from fixup() which does it already. */
7868 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
7869 const int kind = PyUnicode_KIND(self);
7870 void *data = PyUnicode_DATA(self);
7871 Py_UCS4 maxchar = 0, ch, fixed;
7872 Py_ssize_t i;
7873
7874 for (i = 0; i < len; ++i) {
7875 ch = PyUnicode_READ(kind, data, i);
7876 fixed = 0;
7877 if (ch > 127) {
7878 if (Py_UNICODE_ISSPACE(ch))
7879 fixed = ' ';
7880 else {
7881 const int decimal = Py_UNICODE_TODECIMAL(ch);
7882 if (decimal >= 0)
7883 fixed = '0' + decimal;
7884 }
7885 if (fixed != 0) {
7886 if (fixed > maxchar)
7887 maxchar = fixed;
7888 PyUnicode_WRITE(kind, data, i, fixed);
7889 }
7890 else if (ch > maxchar)
7891 maxchar = ch;
7892 }
7893 else if (ch > maxchar)
7894 maxchar = ch;
7895 }
7896
7897 return maxchar;
7898}
7899
7900PyObject *
7901_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
7902{
7903 if (!PyUnicode_Check(unicode)) {
7904 PyErr_BadInternalCall();
7905 return NULL;
7906 }
7907 if (PyUnicode_READY(unicode) == -1)
7908 return NULL;
7909 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
7910 /* If the string is already ASCII, just return the same string */
7911 Py_INCREF(unicode);
7912 return unicode;
7913 }
7914 return fixup((PyUnicodeObject *)unicode, fix_decimal_and_space_to_ascii);
7915}
7916
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007917PyObject *
7918PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
7919 Py_ssize_t length)
7920{
7921 PyObject *result;
7922 Py_UNICODE *p; /* write pointer into result */
7923 Py_ssize_t i;
7924 /* Copy to a new string */
7925 result = (PyObject *)_PyUnicode_New(length);
7926 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
7927 if (result == NULL)
7928 return result;
7929 p = PyUnicode_AS_UNICODE(result);
7930 /* Iterate over code points */
7931 for (i = 0; i < length; i++) {
7932 Py_UNICODE ch =s[i];
7933 if (ch > 127) {
7934 int decimal = Py_UNICODE_TODECIMAL(ch);
7935 if (decimal >= 0)
7936 p[i] = '0' + decimal;
7937 }
7938 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007939 if (PyUnicode_READY((PyUnicodeObject*)result) == -1) {
7940 Py_DECREF(result);
7941 return NULL;
7942 }
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007943 return result;
7944}
Guido van Rossum9e896b32000-04-05 20:11:21 +00007945/* --- Decimal Encoder ---------------------------------------------------- */
7946
Alexander Belopolsky40018472011-02-26 01:02:56 +00007947int
7948PyUnicode_EncodeDecimal(Py_UNICODE *s,
7949 Py_ssize_t length,
7950 char *output,
7951 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00007952{
7953 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007954 PyObject *errorHandler = NULL;
7955 PyObject *exc = NULL;
7956 const char *encoding = "decimal";
7957 const char *reason = "invalid decimal Unicode string";
7958 /* the following variable is used for caching string comparisons
7959 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
7960 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00007961
7962 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007963 PyErr_BadArgument();
7964 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00007965 }
7966
7967 p = s;
7968 end = s + length;
7969 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007970 register Py_UNICODE ch = *p;
7971 int decimal;
7972 PyObject *repunicode;
7973 Py_ssize_t repsize;
7974 Py_ssize_t newpos;
7975 Py_UNICODE *uni2;
7976 Py_UNICODE *collstart;
7977 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00007978
Benjamin Peterson29060642009-01-31 22:14:21 +00007979 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007980 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00007981 ++p;
7982 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007983 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007984 decimal = Py_UNICODE_TODECIMAL(ch);
7985 if (decimal >= 0) {
7986 *output++ = '0' + decimal;
7987 ++p;
7988 continue;
7989 }
7990 if (0 < ch && ch < 256) {
7991 *output++ = (char)ch;
7992 ++p;
7993 continue;
7994 }
7995 /* All other characters are considered unencodable */
7996 collstart = p;
7997 collend = p+1;
7998 while (collend < end) {
7999 if ((0 < *collend && *collend < 256) ||
8000 !Py_UNICODE_ISSPACE(*collend) ||
8001 Py_UNICODE_TODECIMAL(*collend))
8002 break;
8003 }
8004 /* cache callback name lookup
8005 * (if not done yet, i.e. it's the first error) */
8006 if (known_errorHandler==-1) {
8007 if ((errors==NULL) || (!strcmp(errors, "strict")))
8008 known_errorHandler = 1;
8009 else if (!strcmp(errors, "replace"))
8010 known_errorHandler = 2;
8011 else if (!strcmp(errors, "ignore"))
8012 known_errorHandler = 3;
8013 else if (!strcmp(errors, "xmlcharrefreplace"))
8014 known_errorHandler = 4;
8015 else
8016 known_errorHandler = 0;
8017 }
8018 switch (known_errorHandler) {
8019 case 1: /* strict */
8020 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
8021 goto onError;
8022 case 2: /* replace */
8023 for (p = collstart; p < collend; ++p)
8024 *output++ = '?';
8025 /* fall through */
8026 case 3: /* ignore */
8027 p = collend;
8028 break;
8029 case 4: /* xmlcharrefreplace */
8030 /* generate replacement (temporarily (mis)uses p) */
8031 for (p = collstart; p < collend; ++p)
8032 output += sprintf(output, "&#%d;", (int)*p);
8033 p = collend;
8034 break;
8035 default:
8036 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
8037 encoding, reason, s, length, &exc,
8038 collstart-s, collend-s, &newpos);
8039 if (repunicode == NULL)
8040 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008041 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00008042 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008043 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8044 Py_DECREF(repunicode);
8045 goto onError;
8046 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008047 /* generate replacement */
8048 repsize = PyUnicode_GET_SIZE(repunicode);
8049 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
8050 Py_UNICODE ch = *uni2;
8051 if (Py_UNICODE_ISSPACE(ch))
8052 *output++ = ' ';
8053 else {
8054 decimal = Py_UNICODE_TODECIMAL(ch);
8055 if (decimal >= 0)
8056 *output++ = '0' + decimal;
8057 else if (0 < ch && ch < 256)
8058 *output++ = (char)ch;
8059 else {
8060 Py_DECREF(repunicode);
8061 raise_encode_exception(&exc, encoding,
8062 s, length, collstart-s, collend-s, reason);
8063 goto onError;
8064 }
8065 }
8066 }
8067 p = s + newpos;
8068 Py_DECREF(repunicode);
8069 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00008070 }
8071 /* 0-terminate the output string */
8072 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008073 Py_XDECREF(exc);
8074 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008075 return 0;
8076
Benjamin Peterson29060642009-01-31 22:14:21 +00008077 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008078 Py_XDECREF(exc);
8079 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008080 return -1;
8081}
8082
Guido van Rossumd57fd912000-03-10 22:53:23 +00008083/* --- Helpers ------------------------------------------------------------ */
8084
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008085#include "stringlib/ucs1lib.h"
8086#include "stringlib/fastsearch.h"
8087#include "stringlib/partition.h"
8088#include "stringlib/split.h"
8089#include "stringlib/count.h"
8090#include "stringlib/find.h"
8091#include "stringlib/localeutil.h"
8092#include "stringlib/undef.h"
8093
8094#include "stringlib/ucs2lib.h"
8095#include "stringlib/fastsearch.h"
8096#include "stringlib/partition.h"
8097#include "stringlib/split.h"
8098#include "stringlib/count.h"
8099#include "stringlib/find.h"
8100#include "stringlib/localeutil.h"
8101#include "stringlib/undef.h"
8102
8103#include "stringlib/ucs4lib.h"
8104#include "stringlib/fastsearch.h"
8105#include "stringlib/partition.h"
8106#include "stringlib/split.h"
8107#include "stringlib/count.h"
8108#include "stringlib/find.h"
8109#include "stringlib/localeutil.h"
8110#include "stringlib/undef.h"
8111
8112static Py_ssize_t
8113any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
8114 const Py_UCS1*, Py_ssize_t,
8115 Py_ssize_t, Py_ssize_t),
8116 Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
8117 const Py_UCS2*, Py_ssize_t,
8118 Py_ssize_t, Py_ssize_t),
8119 Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t,
8120 const Py_UCS4*, Py_ssize_t,
8121 Py_ssize_t, Py_ssize_t),
8122 PyObject* s1, PyObject* s2,
8123 Py_ssize_t start,
8124 Py_ssize_t end)
8125{
8126 int kind1, kind2, kind;
8127 void *buf1, *buf2;
8128 Py_ssize_t len1, len2, result;
8129
8130 kind1 = PyUnicode_KIND(s1);
8131 kind2 = PyUnicode_KIND(s2);
8132 kind = kind1 > kind2 ? kind1 : kind2;
8133 buf1 = PyUnicode_DATA(s1);
8134 buf2 = PyUnicode_DATA(s2);
8135 if (kind1 != kind)
8136 buf1 = _PyUnicode_AsKind(s1, kind);
8137 if (!buf1)
8138 return -2;
8139 if (kind2 != kind)
8140 buf2 = _PyUnicode_AsKind(s2, kind);
8141 if (!buf2) {
8142 if (kind1 != kind) PyMem_Free(buf1);
8143 return -2;
8144 }
8145 len1 = PyUnicode_GET_LENGTH(s1);
8146 len2 = PyUnicode_GET_LENGTH(s2);
8147
8148 switch(kind) {
8149 case PyUnicode_1BYTE_KIND:
8150 result = ucs1(buf1, len1, buf2, len2, start, end);
8151 break;
8152 case PyUnicode_2BYTE_KIND:
8153 result = ucs2(buf1, len1, buf2, len2, start, end);
8154 break;
8155 case PyUnicode_4BYTE_KIND:
8156 result = ucs4(buf1, len1, buf2, len2, start, end);
8157 break;
8158 default:
8159 assert(0); result = -2;
8160 }
8161
8162 if (kind1 != kind)
8163 PyMem_Free(buf1);
8164 if (kind2 != kind)
8165 PyMem_Free(buf2);
8166
8167 return result;
8168}
8169
8170Py_ssize_t
8171_PyUnicode_InsertThousandsGrouping(int kind, void *data,
8172 Py_ssize_t n_buffer,
8173 void *digits, Py_ssize_t n_digits,
8174 Py_ssize_t min_width,
8175 const char *grouping,
8176 const char *thousands_sep)
8177{
8178 switch(kind) {
8179 case PyUnicode_1BYTE_KIND:
8180 return _PyUnicode_ucs1_InsertThousandsGrouping(
8181 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8182 min_width, grouping, thousands_sep);
8183 case PyUnicode_2BYTE_KIND:
8184 return _PyUnicode_ucs2_InsertThousandsGrouping(
8185 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
8186 min_width, grouping, thousands_sep);
8187 case PyUnicode_4BYTE_KIND:
8188 return _PyUnicode_ucs4_InsertThousandsGrouping(
8189 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
8190 min_width, grouping, thousands_sep);
8191 }
8192 assert(0);
8193 return -1;
8194}
8195
8196
Eric Smith8c663262007-08-25 02:26:07 +00008197#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00008198#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008199
Thomas Wouters477c8d52006-05-27 19:21:47 +00008200#include "stringlib/count.h"
8201#include "stringlib/find.h"
Eric Smith5807c412008-05-11 21:00:57 +00008202
/* Helper macro to fix up start/end slice values in place.

   - end greater than len is clamped to len;
   - a negative end or start has len added to it, and is clamped to 0 if it
     is still negative afterwards.

   start and end must be modifiable lvalues.  All three arguments may be
   evaluated more than once, so they must not have side effects.

   The body is wrapped in do { ... } while (0) so that the macro behaves as a
   single statement (safe inside an unbraced if/else), and every argument is
   parenthesized so that compound expressions expand correctly.  Invoke it
   with a trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00008217
Alexander Belopolsky40018472011-02-26 01:02:56 +00008218Py_ssize_t
8219PyUnicode_Count(PyObject *str,
8220 PyObject *substr,
8221 Py_ssize_t start,
8222 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008223{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008224 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008225 PyUnicodeObject* str_obj;
8226 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008227 int kind1, kind2, kind;
8228 void *buf1 = NULL, *buf2 = NULL;
8229 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008230
Thomas Wouters477c8d52006-05-27 19:21:47 +00008231 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008232 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008233 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008234 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02008235 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008236 Py_DECREF(str_obj);
8237 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008238 }
Tim Petersced69f82003-09-16 20:30:58 +00008239
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008240 kind1 = PyUnicode_KIND(str_obj);
8241 kind2 = PyUnicode_KIND(sub_obj);
8242 kind = kind1 > kind2 ? kind1 : kind2;
8243 buf1 = PyUnicode_DATA(str_obj);
8244 if (kind1 != kind)
8245 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
8246 if (!buf1)
8247 goto onError;
8248 buf2 = PyUnicode_DATA(sub_obj);
8249 if (kind2 != kind)
8250 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
8251 if (!buf2)
8252 goto onError;
8253 len1 = PyUnicode_GET_LENGTH(str_obj);
8254 len2 = PyUnicode_GET_LENGTH(sub_obj);
8255
8256 ADJUST_INDICES(start, end, len1);
8257 switch(kind) {
8258 case PyUnicode_1BYTE_KIND:
8259 result = ucs1lib_count(
8260 ((Py_UCS1*)buf1) + start, end - start,
8261 buf2, len2, PY_SSIZE_T_MAX
8262 );
8263 break;
8264 case PyUnicode_2BYTE_KIND:
8265 result = ucs2lib_count(
8266 ((Py_UCS2*)buf1) + start, end - start,
8267 buf2, len2, PY_SSIZE_T_MAX
8268 );
8269 break;
8270 case PyUnicode_4BYTE_KIND:
8271 result = ucs4lib_count(
8272 ((Py_UCS4*)buf1) + start, end - start,
8273 buf2, len2, PY_SSIZE_T_MAX
8274 );
8275 break;
8276 default:
8277 assert(0); result = 0;
8278 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008279
8280 Py_DECREF(sub_obj);
8281 Py_DECREF(str_obj);
8282
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008283 if (kind1 != kind)
8284 PyMem_Free(buf1);
8285 if (kind2 != kind)
8286 PyMem_Free(buf2);
8287
Guido van Rossumd57fd912000-03-10 22:53:23 +00008288 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008289 onError:
8290 Py_DECREF(sub_obj);
8291 Py_DECREF(str_obj);
8292 if (kind1 != kind && buf1)
8293 PyMem_Free(buf1);
8294 if (kind2 != kind && buf2)
8295 PyMem_Free(buf2);
8296 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008297}
8298
Alexander Belopolsky40018472011-02-26 01:02:56 +00008299Py_ssize_t
8300PyUnicode_Find(PyObject *str,
8301 PyObject *sub,
8302 Py_ssize_t start,
8303 Py_ssize_t end,
8304 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008305{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008306 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008307
Guido van Rossumd57fd912000-03-10 22:53:23 +00008308 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008309 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008310 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008311 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008312 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008313 Py_DECREF(str);
8314 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008315 }
Tim Petersced69f82003-09-16 20:30:58 +00008316
Thomas Wouters477c8d52006-05-27 19:21:47 +00008317 if (direction > 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008318 result = any_find_slice(
8319 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
8320 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008321 );
8322 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008323 result = any_find_slice(
8324 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
8325 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008326 );
8327
Guido van Rossumd57fd912000-03-10 22:53:23 +00008328 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008329 Py_DECREF(sub);
8330
Guido van Rossumd57fd912000-03-10 22:53:23 +00008331 return result;
8332}
8333
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008334Py_ssize_t
8335PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8336 Py_ssize_t start, Py_ssize_t end,
8337 int direction)
8338{
8339 char *result;
8340 int kind;
8341 if (PyUnicode_READY(str) == -1)
8342 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02008343 if (start < 0 || end < 0) {
8344 PyErr_SetString(PyExc_IndexError, "string index out of range");
8345 return -2;
8346 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008347 if (end > PyUnicode_GET_LENGTH(str))
8348 end = PyUnicode_GET_LENGTH(str);
8349 kind = PyUnicode_KIND(str);
8350 result = findchar(PyUnicode_1BYTE_DATA(str)
8351 + PyUnicode_KIND_SIZE(kind, start),
8352 kind,
8353 end-start, ch, direction);
8354 if (!result)
8355 return -1;
8356 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8357}
8358
Alexander Belopolsky40018472011-02-26 01:02:56 +00008359static int
8360tailmatch(PyUnicodeObject *self,
8361 PyUnicodeObject *substring,
8362 Py_ssize_t start,
8363 Py_ssize_t end,
8364 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008365{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008366 int kind_self;
8367 int kind_sub;
8368 void *data_self;
8369 void *data_sub;
8370 Py_ssize_t offset;
8371 Py_ssize_t i;
8372 Py_ssize_t end_sub;
8373
8374 if (PyUnicode_READY(self) == -1 ||
8375 PyUnicode_READY(substring) == -1)
8376 return 0;
8377
8378 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008379 return 1;
8380
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008381 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8382 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008383 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008384 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008385
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008386 kind_self = PyUnicode_KIND(self);
8387 data_self = PyUnicode_DATA(self);
8388 kind_sub = PyUnicode_KIND(substring);
8389 data_sub = PyUnicode_DATA(substring);
8390 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8391
8392 if (direction > 0)
8393 offset = end;
8394 else
8395 offset = start;
8396
8397 if (PyUnicode_READ(kind_self, data_self, offset) ==
8398 PyUnicode_READ(kind_sub, data_sub, 0) &&
8399 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8400 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8401 /* If both are of the same kind, memcmp is sufficient */
8402 if (kind_self == kind_sub) {
8403 return ! memcmp((char *)data_self +
8404 (offset * PyUnicode_CHARACTER_SIZE(substring)),
8405 data_sub,
8406 PyUnicode_GET_LENGTH(substring) *
8407 PyUnicode_CHARACTER_SIZE(substring));
8408 }
8409 /* otherwise we have to compare each character by first accesing it */
8410 else {
8411 /* We do not need to compare 0 and len(substring)-1 because
8412 the if statement above ensured already that they are equal
8413 when we end up here. */
8414 // TODO: honor direction and do a forward or backwards search
8415 for (i = 1; i < end_sub; ++i) {
8416 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8417 PyUnicode_READ(kind_sub, data_sub, i))
8418 return 0;
8419 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008420 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008421 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008422 }
8423
8424 return 0;
8425}
8426
Alexander Belopolsky40018472011-02-26 01:02:56 +00008427Py_ssize_t
8428PyUnicode_Tailmatch(PyObject *str,
8429 PyObject *substr,
8430 Py_ssize_t start,
8431 Py_ssize_t end,
8432 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008433{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008434 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008435
Guido van Rossumd57fd912000-03-10 22:53:23 +00008436 str = PyUnicode_FromObject(str);
8437 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008438 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008439 substr = PyUnicode_FromObject(substr);
8440 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008441 Py_DECREF(str);
8442 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008443 }
Tim Petersced69f82003-09-16 20:30:58 +00008444
Guido van Rossumd57fd912000-03-10 22:53:23 +00008445 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00008446 (PyUnicodeObject *)substr,
8447 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008448 Py_DECREF(str);
8449 Py_DECREF(substr);
8450 return result;
8451}
8452
Guido van Rossumd57fd912000-03-10 22:53:23 +00008453/* Apply fixfct filter to the Unicode object self and return a
8454 reference to the modified object */
8455
Alexander Belopolsky40018472011-02-26 01:02:56 +00008456static PyObject *
8457fixup(PyUnicodeObject *self,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008458 Py_UCS4 (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008459{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008460 PyObject *u;
8461 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008462
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008463 if (PyUnicode_READY(self) == -1)
8464 return NULL;
8465 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8466 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8467 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008468 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008469 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008470
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008471 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
8472 PyUnicode_GET_LENGTH(u) * PyUnicode_CHARACTER_SIZE(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008473
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008474 /* fix functions return the new maximum character in a string,
8475 if the kind of the resulting unicode object does not change,
8476 everything is fine. Otherwise we need to change the string kind
8477 and re-run the fix function. */
8478 maxchar_new = fixfct((PyUnicodeObject*)u);
8479 if (maxchar_new == 0)
8480 /* do nothing, keep maxchar_new at 0 which means no changes. */;
8481 else if (maxchar_new <= 127)
8482 maxchar_new = 127;
8483 else if (maxchar_new <= 255)
8484 maxchar_new = 255;
8485 else if (maxchar_new <= 65535)
8486 maxchar_new = 65535;
8487 else
8488 maxchar_new = 1114111; /* 0x10ffff */
8489
8490 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008491 /* fixfct should return TRUE if it modified the buffer. If
8492 FALSE, return a reference to the original buffer instead
8493 (to save space, not time) */
8494 Py_INCREF(self);
8495 Py_DECREF(u);
8496 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008497 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008498 else if (maxchar_new == maxchar_old) {
8499 return u;
8500 }
8501 else {
8502 /* In case the maximum character changed, we need to
8503 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008504 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008505 if (v == NULL) {
8506 Py_DECREF(u);
8507 return NULL;
8508 }
8509 if (maxchar_new > maxchar_old) {
8510 /* If the maxchar increased so that the kind changed, not all
8511 characters are representable anymore and we need to fix the
8512 string again. This only happens in very few cases. */
Victor Stinner157f83f2011-09-28 21:41:31 +02008513 if (PyUnicode_CopyCharacters(v, 0,
8514 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008515 PyUnicode_GET_LENGTH(self)) < 0)
8516 {
8517 Py_DECREF(u);
8518 return NULL;
8519 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008520 maxchar_old = fixfct((PyUnicodeObject*)v);
8521 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8522 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008523 else {
Victor Stinner157f83f2011-09-28 21:41:31 +02008524 if (PyUnicode_CopyCharacters(v, 0,
8525 u, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008526 PyUnicode_GET_LENGTH(self)) < 0)
8527 {
8528 Py_DECREF(u);
8529 return NULL;
8530 }
8531 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008532
8533 Py_DECREF(u);
8534 return v;
8535 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008536}
8537
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008538static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008539fixupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008540{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008541 /* No need to call PyUnicode_READY(self) because this function is only
8542 called as a callback from fixup() which does it already. */
8543 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8544 const int kind = PyUnicode_KIND(self);
8545 void *data = PyUnicode_DATA(self);
8546 int touched = 0;
8547 Py_UCS4 maxchar = 0;
8548 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008549
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008550 for (i = 0; i < len; ++i) {
8551 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8552 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8553 if (up != ch) {
8554 if (up > maxchar)
8555 maxchar = up;
8556 PyUnicode_WRITE(kind, data, i, up);
8557 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008558 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008559 else if (ch > maxchar)
8560 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008561 }
8562
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008563 if (touched)
8564 return maxchar;
8565 else
8566 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008567}
8568
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008569static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008570fixlower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008571{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008572 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8573 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8574 const int kind = PyUnicode_KIND(self);
8575 void *data = PyUnicode_DATA(self);
8576 int touched = 0;
8577 Py_UCS4 maxchar = 0;
8578 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008579
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008580 for(i = 0; i < len; ++i) {
8581 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8582 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8583 if (lo != ch) {
8584 if (lo > maxchar)
8585 maxchar = lo;
8586 PyUnicode_WRITE(kind, data, i, lo);
8587 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008588 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008589 else if (ch > maxchar)
8590 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008591 }
8592
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008593 if (touched)
8594 return maxchar;
8595 else
8596 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008597}
8598
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008599static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008600fixswapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008601{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008602 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8603 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8604 const int kind = PyUnicode_KIND(self);
8605 void *data = PyUnicode_DATA(self);
8606 int touched = 0;
8607 Py_UCS4 maxchar = 0;
8608 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008609
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008610 for(i = 0; i < len; ++i) {
8611 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8612 Py_UCS4 nu = 0;
8613
8614 if (Py_UNICODE_ISUPPER(ch))
8615 nu = Py_UNICODE_TOLOWER(ch);
8616 else if (Py_UNICODE_ISLOWER(ch))
8617 nu = Py_UNICODE_TOUPPER(ch);
8618
8619 if (nu != 0) {
8620 if (nu > maxchar)
8621 maxchar = nu;
8622 PyUnicode_WRITE(kind, data, i, nu);
8623 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008624 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008625 else if (ch > maxchar)
8626 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008627 }
8628
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008629 if (touched)
8630 return maxchar;
8631 else
8632 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008633}
8634
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008635static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008636fixcapitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008637{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008638 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8639 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8640 const int kind = PyUnicode_KIND(self);
8641 void *data = PyUnicode_DATA(self);
8642 int touched = 0;
8643 Py_UCS4 maxchar = 0;
8644 Py_ssize_t i = 0;
8645 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00008646
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008647 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008648 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008649
8650 ch = PyUnicode_READ(kind, data, i);
8651 if (!Py_UNICODE_ISUPPER(ch)) {
8652 maxchar = Py_UNICODE_TOUPPER(ch);
8653 PyUnicode_WRITE(kind, data, i, maxchar);
8654 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008655 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008656 ++i;
8657 for(; i < len; ++i) {
8658 ch = PyUnicode_READ(kind, data, i);
8659 if (!Py_UNICODE_ISLOWER(ch)) {
8660 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8661 if (lo > maxchar)
8662 maxchar = lo;
8663 PyUnicode_WRITE(kind, data, i, lo);
8664 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008665 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008666 else if (ch > maxchar)
8667 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008668 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008669
8670 if (touched)
8671 return maxchar;
8672 else
8673 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008674}
8675
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008676static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008677fixtitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008678{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008679 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8680 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8681 const int kind = PyUnicode_KIND(self);
8682 void *data = PyUnicode_DATA(self);
8683 Py_UCS4 maxchar = 0;
8684 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008685 int previous_is_cased;
8686
8687 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008688 if (len == 1) {
8689 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8690 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
8691 if (ti != ch) {
8692 PyUnicode_WRITE(kind, data, i, ti);
8693 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00008694 }
8695 else
8696 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008697 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008698 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008699 for(; i < len; ++i) {
8700 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8701 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00008702
Benjamin Peterson29060642009-01-31 22:14:21 +00008703 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008704 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00008705 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008706 nu = Py_UNICODE_TOTITLE(ch);
8707
8708 if (nu > maxchar)
8709 maxchar = nu;
8710 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00008711
Benjamin Peterson29060642009-01-31 22:14:21 +00008712 if (Py_UNICODE_ISLOWER(ch) ||
8713 Py_UNICODE_ISUPPER(ch) ||
8714 Py_UNICODE_ISTITLE(ch))
8715 previous_is_cased = 1;
8716 else
8717 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008718 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008719 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008720}
8721
Tim Peters8ce9f162004-08-27 01:49:32 +00008722PyObject *
8723PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008724{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008725 PyObject *sep = NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008726 Py_ssize_t seplen = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008727 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00008728 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008729 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
8730 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00008731 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008732 Py_ssize_t sz, i, res_offset;
8733 Py_UCS4 maxchar = 0;
8734 Py_UCS4 item_maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008735
Tim Peters05eba1f2004-08-27 21:32:02 +00008736 fseq = PySequence_Fast(seq, "");
8737 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008738 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00008739 }
8740
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008741 /* NOTE: the following code can't call back into Python code,
8742 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00008743 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008744
Tim Peters05eba1f2004-08-27 21:32:02 +00008745 seqlen = PySequence_Fast_GET_SIZE(fseq);
8746 /* If empty sequence, return u"". */
8747 if (seqlen == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008748 res = PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008749 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00008750 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008751 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00008752 /* If singleton sequence with an exact Unicode, return that. */
8753 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008754 item = items[0];
8755 if (PyUnicode_CheckExact(item)) {
8756 Py_INCREF(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008757 res = item;
Benjamin Peterson29060642009-01-31 22:14:21 +00008758 goto Done;
8759 }
Tim Peters8ce9f162004-08-27 01:49:32 +00008760 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008761 else {
8762 /* Set up sep and seplen */
8763 if (separator == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008764 /* fall back to a blank space separator */
8765 sep = PyUnicode_FromOrdinal(' ');
Victor Stinnere9a29352011-10-01 02:14:59 +02008766 if (!sep)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008767 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00008768 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008769 else {
8770 if (!PyUnicode_Check(separator)) {
8771 PyErr_Format(PyExc_TypeError,
8772 "separator: expected str instance,"
8773 " %.80s found",
8774 Py_TYPE(separator)->tp_name);
8775 goto onError;
8776 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008777 if (PyUnicode_READY(separator) == -1)
8778 goto onError;
8779 sep = separator;
8780 seplen = PyUnicode_GET_LENGTH(separator);
8781 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
8782 /* inc refcount to keep this code path symetric with the
8783 above case of a blank separator */
8784 Py_INCREF(sep);
Tim Peters05eba1f2004-08-27 21:32:02 +00008785 }
8786 }
8787
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008788 /* There are at least two things to join, or else we have a subclass
8789 * of str in the sequence.
8790 * Do a pre-pass to figure out the total amount of space we'll
8791 * need (sz), and see whether all argument are strings.
8792 */
8793 sz = 0;
8794 for (i = 0; i < seqlen; i++) {
8795 const Py_ssize_t old_sz = sz;
8796 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00008797 if (!PyUnicode_Check(item)) {
8798 PyErr_Format(PyExc_TypeError,
8799 "sequence item %zd: expected str instance,"
8800 " %.80s found",
8801 i, Py_TYPE(item)->tp_name);
8802 goto onError;
8803 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008804 if (PyUnicode_READY(item) == -1)
8805 goto onError;
8806 sz += PyUnicode_GET_LENGTH(item);
8807 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
8808 if (item_maxchar > maxchar)
8809 maxchar = item_maxchar;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008810 if (i != 0)
8811 sz += seplen;
8812 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
8813 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008814 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008815 goto onError;
8816 }
8817 }
Tim Petersced69f82003-09-16 20:30:58 +00008818
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008819 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008820 if (res == NULL)
8821 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00008822
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008823 /* Catenate everything. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008824 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008825 Py_ssize_t itemlen;
8826 item = items[i];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008827 itemlen = PyUnicode_GET_LENGTH(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008828 /* Copy item, and maybe the separator. */
8829 if (i) {
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008830 if (PyUnicode_CopyCharacters(res, res_offset,
8831 sep, 0, seplen) < 0)
8832 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008833 res_offset += seplen;
Benjamin Peterson29060642009-01-31 22:14:21 +00008834 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008835 if (PyUnicode_CopyCharacters(res, res_offset,
8836 item, 0, itemlen) < 0)
8837 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008838 res_offset += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00008839 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008840 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00008841
Benjamin Peterson29060642009-01-31 22:14:21 +00008842 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00008843 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008844 Py_XDECREF(sep);
8845 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008846
Benjamin Peterson29060642009-01-31 22:14:21 +00008847 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00008848 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008849 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00008850 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008851 return NULL;
8852}
8853
/* Store `length` copies of `value` into the character buffer `data`,
   beginning at character index `start`.  `kind` selects the width of one
   character: one byte, Py_UCS2, or (for any other kind) Py_UCS4.

   Every macro argument is parenthesized where it is used, so compound
   expressions may be passed safely.  Arguments may be evaluated more than
   once and must not have side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        } \
    } while (0)
8876
Alexander Belopolsky40018472011-02-26 01:02:56 +00008877static PyUnicodeObject *
8878pad(PyUnicodeObject *self,
8879 Py_ssize_t left,
8880 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008881 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008882{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008883 PyObject *u;
8884 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008885 int kind;
8886 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008887
8888 if (left < 0)
8889 left = 0;
8890 if (right < 0)
8891 right = 0;
8892
Tim Peters7a29bd52001-09-12 03:03:31 +00008893 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008894 Py_INCREF(self);
8895 return self;
8896 }
8897
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008898 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
8899 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00008900 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
8901 return NULL;
8902 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008903 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
8904 if (fill > maxchar)
8905 maxchar = fill;
8906 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008907 if (!u)
8908 return NULL;
8909
8910 kind = PyUnicode_KIND(u);
8911 data = PyUnicode_DATA(u);
8912 if (left)
8913 FILL(kind, data, fill, 0, left);
8914 if (right)
8915 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinner157f83f2011-09-28 21:41:31 +02008916 if (PyUnicode_CopyCharacters(u, left,
8917 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008918 _PyUnicode_LENGTH(self)) < 0)
8919 {
8920 Py_DECREF(u);
8921 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008922 }
8923
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008924 return (PyUnicodeObject*)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008925}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008926#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00008927
Alexander Belopolsky40018472011-02-26 01:02:56 +00008928PyObject *
8929PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008930{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008931 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008932
8933 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008934 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008935 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008936
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008937 switch(PyUnicode_KIND(string)) {
8938 case PyUnicode_1BYTE_KIND:
8939 list = ucs1lib_splitlines(
8940 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
8941 PyUnicode_GET_LENGTH(string), keepends);
8942 break;
8943 case PyUnicode_2BYTE_KIND:
8944 list = ucs2lib_splitlines(
8945 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
8946 PyUnicode_GET_LENGTH(string), keepends);
8947 break;
8948 case PyUnicode_4BYTE_KIND:
8949 list = ucs4lib_splitlines(
8950 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
8951 PyUnicode_GET_LENGTH(string), keepends);
8952 break;
8953 default:
8954 assert(0);
8955 list = 0;
8956 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008957 Py_DECREF(string);
8958 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008959}
8960
Alexander Belopolsky40018472011-02-26 01:02:56 +00008961static PyObject *
8962split(PyUnicodeObject *self,
8963 PyUnicodeObject *substring,
8964 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008965{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008966 int kind1, kind2, kind;
8967 void *buf1, *buf2;
8968 Py_ssize_t len1, len2;
8969 PyObject* out;
8970
Guido van Rossumd57fd912000-03-10 22:53:23 +00008971 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008972 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008973
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008974 if (PyUnicode_READY(self) == -1)
8975 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008976
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008977 if (substring == NULL)
8978 switch(PyUnicode_KIND(self)) {
8979 case PyUnicode_1BYTE_KIND:
8980 return ucs1lib_split_whitespace(
8981 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
8982 PyUnicode_GET_LENGTH(self), maxcount
8983 );
8984 case PyUnicode_2BYTE_KIND:
8985 return ucs2lib_split_whitespace(
8986 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
8987 PyUnicode_GET_LENGTH(self), maxcount
8988 );
8989 case PyUnicode_4BYTE_KIND:
8990 return ucs4lib_split_whitespace(
8991 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
8992 PyUnicode_GET_LENGTH(self), maxcount
8993 );
8994 default:
8995 assert(0);
8996 return NULL;
8997 }
8998
8999 if (PyUnicode_READY(substring) == -1)
9000 return NULL;
9001
9002 kind1 = PyUnicode_KIND(self);
9003 kind2 = PyUnicode_KIND(substring);
9004 kind = kind1 > kind2 ? kind1 : kind2;
9005 buf1 = PyUnicode_DATA(self);
9006 buf2 = PyUnicode_DATA(substring);
9007 if (kind1 != kind)
9008 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9009 if (!buf1)
9010 return NULL;
9011 if (kind2 != kind)
9012 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9013 if (!buf2) {
9014 if (kind1 != kind) PyMem_Free(buf1);
9015 return NULL;
9016 }
9017 len1 = PyUnicode_GET_LENGTH(self);
9018 len2 = PyUnicode_GET_LENGTH(substring);
9019
9020 switch(kind) {
9021 case PyUnicode_1BYTE_KIND:
9022 out = ucs1lib_split(
9023 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9024 break;
9025 case PyUnicode_2BYTE_KIND:
9026 out = ucs2lib_split(
9027 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9028 break;
9029 case PyUnicode_4BYTE_KIND:
9030 out = ucs4lib_split(
9031 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9032 break;
9033 default:
9034 out = NULL;
9035 }
9036 if (kind1 != kind)
9037 PyMem_Free(buf1);
9038 if (kind2 != kind)
9039 PyMem_Free(buf2);
9040 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009041}
9042
Alexander Belopolsky40018472011-02-26 01:02:56 +00009043static PyObject *
9044rsplit(PyUnicodeObject *self,
9045 PyUnicodeObject *substring,
9046 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009047{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009048 int kind1, kind2, kind;
9049 void *buf1, *buf2;
9050 Py_ssize_t len1, len2;
9051 PyObject* out;
9052
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009053 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009054 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009055
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009056 if (PyUnicode_READY(self) == -1)
9057 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009058
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009059 if (substring == NULL)
9060 switch(PyUnicode_KIND(self)) {
9061 case PyUnicode_1BYTE_KIND:
9062 return ucs1lib_rsplit_whitespace(
9063 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9064 PyUnicode_GET_LENGTH(self), maxcount
9065 );
9066 case PyUnicode_2BYTE_KIND:
9067 return ucs2lib_rsplit_whitespace(
9068 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9069 PyUnicode_GET_LENGTH(self), maxcount
9070 );
9071 case PyUnicode_4BYTE_KIND:
9072 return ucs4lib_rsplit_whitespace(
9073 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9074 PyUnicode_GET_LENGTH(self), maxcount
9075 );
9076 default:
9077 assert(0);
9078 return NULL;
9079 }
9080
9081 if (PyUnicode_READY(substring) == -1)
9082 return NULL;
9083
9084 kind1 = PyUnicode_KIND(self);
9085 kind2 = PyUnicode_KIND(substring);
9086 kind = kind1 > kind2 ? kind1 : kind2;
9087 buf1 = PyUnicode_DATA(self);
9088 buf2 = PyUnicode_DATA(substring);
9089 if (kind1 != kind)
9090 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9091 if (!buf1)
9092 return NULL;
9093 if (kind2 != kind)
9094 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9095 if (!buf2) {
9096 if (kind1 != kind) PyMem_Free(buf1);
9097 return NULL;
9098 }
9099 len1 = PyUnicode_GET_LENGTH(self);
9100 len2 = PyUnicode_GET_LENGTH(substring);
9101
9102 switch(kind) {
9103 case PyUnicode_1BYTE_KIND:
9104 out = ucs1lib_rsplit(
9105 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9106 break;
9107 case PyUnicode_2BYTE_KIND:
9108 out = ucs2lib_rsplit(
9109 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9110 break;
9111 case PyUnicode_4BYTE_KIND:
9112 out = ucs4lib_rsplit(
9113 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9114 break;
9115 default:
9116 out = NULL;
9117 }
9118 if (kind1 != kind)
9119 PyMem_Free(buf1);
9120 if (kind2 != kind)
9121 PyMem_Free(buf2);
9122 return out;
9123}
9124
9125static Py_ssize_t
9126anylib_find(int kind, void *buf1, Py_ssize_t len1,
9127 void *buf2, Py_ssize_t len2, Py_ssize_t offset)
9128{
9129 switch(kind) {
9130 case PyUnicode_1BYTE_KIND:
9131 return ucs1lib_find(buf1, len1, buf2, len2, offset);
9132 case PyUnicode_2BYTE_KIND:
9133 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9134 case PyUnicode_4BYTE_KIND:
9135 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9136 }
9137 assert(0);
9138 return -1;
9139}
9140
9141static Py_ssize_t
9142anylib_count(int kind, void* sbuf, Py_ssize_t slen,
9143 void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
9144{
9145 switch(kind) {
9146 case PyUnicode_1BYTE_KIND:
9147 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
9148 case PyUnicode_2BYTE_KIND:
9149 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9150 case PyUnicode_4BYTE_KIND:
9151 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9152 }
9153 assert(0);
9154 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009155}
9156
Alexander Belopolsky40018472011-02-26 01:02:56 +00009157static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009158replace(PyObject *self, PyObject *str1,
9159 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009160{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009161 PyObject *u;
9162 char *sbuf = PyUnicode_DATA(self);
9163 char *buf1 = PyUnicode_DATA(str1);
9164 char *buf2 = PyUnicode_DATA(str2);
9165 int srelease = 0, release1 = 0, release2 = 0;
9166 int skind = PyUnicode_KIND(self);
9167 int kind1 = PyUnicode_KIND(str1);
9168 int kind2 = PyUnicode_KIND(str2);
9169 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9170 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9171 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009172
9173 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009174 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009175 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009176 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009177
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009178 if (skind < kind1)
9179 /* substring too wide to be present */
9180 goto nothing;
9181
9182 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00009183 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009184 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009185 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009186 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009187 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009188 /* replace characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009189 Py_UCS4 u1, u2, maxchar;
9190 int mayshrink, rkind;
9191 u1 = PyUnicode_READ_CHAR(str1, 0);
9192 if (!findchar(sbuf, PyUnicode_KIND(self),
9193 slen, u1, 1))
Thomas Wouters477c8d52006-05-27 19:21:47 +00009194 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009195 u2 = PyUnicode_READ_CHAR(str2, 0);
9196 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9197 /* Replacing u1 with u2 may cause a maxchar reduction in the
9198 result string. */
9199 mayshrink = maxchar > 127;
9200 if (u2 > maxchar) {
9201 maxchar = u2;
9202 mayshrink = 0;
9203 }
9204 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009205 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009206 goto error;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009207 if (PyUnicode_CopyCharacters(u, 0,
9208 (PyObject*)self, 0, slen) < 0)
9209 {
9210 Py_DECREF(u);
9211 return NULL;
9212 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009213 rkind = PyUnicode_KIND(u);
9214 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
9215 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009216 if (--maxcount < 0)
9217 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009218 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009219 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009220 if (mayshrink) {
9221 PyObject *tmp = u;
9222 u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp),
9223 PyUnicode_GET_LENGTH(tmp));
9224 Py_DECREF(tmp);
9225 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009226 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009227 int rkind = skind;
9228 char *res;
9229 if (kind1 < rkind) {
9230 /* widen substring */
9231 buf1 = _PyUnicode_AsKind(str1, rkind);
9232 if (!buf1) goto error;
9233 release1 = 1;
9234 }
9235 i = anylib_find(rkind, sbuf, slen, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009236 if (i < 0)
9237 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009238 if (rkind > kind2) {
9239 /* widen replacement */
9240 buf2 = _PyUnicode_AsKind(str2, rkind);
9241 if (!buf2) goto error;
9242 release2 = 1;
9243 }
9244 else if (rkind < kind2) {
9245 /* widen self and buf1 */
9246 rkind = kind2;
9247 if (release1) PyMem_Free(buf1);
9248 sbuf = _PyUnicode_AsKind(self, rkind);
9249 if (!sbuf) goto error;
9250 srelease = 1;
9251 buf1 = _PyUnicode_AsKind(str1, rkind);
9252 if (!buf1) goto error;
9253 release1 = 1;
9254 }
9255 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen));
9256 if (!res) {
9257 PyErr_NoMemory();
9258 goto error;
9259 }
9260 memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen));
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009261 /* change everything in-place, starting with this one */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009262 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9263 buf2,
9264 PyUnicode_KIND_SIZE(rkind, len2));
9265 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009266
9267 while ( --maxcount > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009268 i = anylib_find(rkind, sbuf+PyUnicode_KIND_SIZE(rkind, i),
9269 slen-i,
9270 buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009271 if (i == -1)
9272 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009273 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9274 buf2,
9275 PyUnicode_KIND_SIZE(rkind, len2));
9276 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009277 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009278
9279 u = PyUnicode_FromKindAndData(rkind, res, slen);
9280 PyMem_Free(res);
9281 if (!u) goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009282 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009283 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009284
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009285 Py_ssize_t n, i, j, ires;
9286 Py_ssize_t product, new_size;
9287 int rkind = skind;
9288 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009289
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009290 if (kind1 < rkind) {
9291 buf1 = _PyUnicode_AsKind(str1, rkind);
9292 if (!buf1) goto error;
9293 release1 = 1;
9294 }
9295 n = anylib_count(rkind, sbuf, slen, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009296 if (n == 0)
9297 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009298 if (kind2 < rkind) {
9299 buf2 = _PyUnicode_AsKind(str2, rkind);
9300 if (!buf2) goto error;
9301 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009302 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009303 else if (kind2 > rkind) {
9304 rkind = kind2;
9305 sbuf = _PyUnicode_AsKind(self, rkind);
9306 if (!sbuf) goto error;
9307 srelease = 1;
9308 if (release1) PyMem_Free(buf1);
9309 buf1 = _PyUnicode_AsKind(str1, rkind);
9310 if (!buf1) goto error;
9311 release1 = 1;
9312 }
9313 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
9314 PyUnicode_GET_LENGTH(str1))); */
9315 product = n * (len2-len1);
9316 if ((product / (len2-len1)) != n) {
9317 PyErr_SetString(PyExc_OverflowError,
9318 "replace string is too long");
9319 goto error;
9320 }
9321 new_size = slen + product;
9322 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
9323 PyErr_SetString(PyExc_OverflowError,
9324 "replace string is too long");
9325 goto error;
9326 }
9327 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size));
9328 if (!res)
9329 goto error;
9330 ires = i = 0;
9331 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009332 while (n-- > 0) {
9333 /* look for next match */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009334 j = anylib_find(rkind,
9335 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9336 slen-i, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009337 if (j == -1)
9338 break;
9339 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009340 /* copy unchanged part [i:j] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009341 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9342 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9343 PyUnicode_KIND_SIZE(rkind, j-i));
9344 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009345 }
9346 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009347 if (len2 > 0) {
9348 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9349 buf2,
9350 PyUnicode_KIND_SIZE(rkind, len2));
9351 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009352 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009353 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009354 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009355 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009356 /* copy tail [i:] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009357 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9358 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9359 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009360 } else {
9361 /* interleave */
9362 while (n > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009363 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9364 buf2,
9365 PyUnicode_KIND_SIZE(rkind, len2));
9366 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009367 if (--n <= 0)
9368 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009369 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9370 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9371 PyUnicode_KIND_SIZE(rkind, 1));
9372 ires++;
9373 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009374 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009375 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9376 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9377 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009378 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009379 u = PyUnicode_FromKindAndData(rkind, res, new_size);
Martin v. Löwis0b1d3482011-10-01 16:35:40 +02009380 PyMem_Free(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009381 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009382 if (srelease)
9383 PyMem_FREE(sbuf);
9384 if (release1)
9385 PyMem_FREE(buf1);
9386 if (release2)
9387 PyMem_FREE(buf2);
9388 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009389
Benjamin Peterson29060642009-01-31 22:14:21 +00009390 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00009391 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009392 if (srelease)
9393 PyMem_FREE(sbuf);
9394 if (release1)
9395 PyMem_FREE(buf1);
9396 if (release2)
9397 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009398 if (PyUnicode_CheckExact(self)) {
9399 Py_INCREF(self);
9400 return (PyObject *) self;
9401 }
Victor Stinner034f6cf2011-09-30 02:26:44 +02009402 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009403 error:
9404 if (srelease && sbuf)
9405 PyMem_FREE(sbuf);
9406 if (release1 && buf1)
9407 PyMem_FREE(buf1);
9408 if (release2 && buf2)
9409 PyMem_FREE(buf2);
9410 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009411}
9412
9413/* --- Unicode Object Methods --------------------------------------------- */
9414
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009415PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009416 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009417\n\
9418Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009419characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009420
9421static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009422unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009423{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009424 return fixup(self, fixtitle);
9425}
9426
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009427PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009428 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009429\n\
9430Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00009431have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009432
9433static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009434unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009435{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009436 return fixup(self, fixcapitalize);
9437}
9438
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Compiled out.  Splits S on whitespace, capitalizes each word in the
   resulting list in place, and joins the words back together. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t pos;

    /* Split into words */
    words = split(self, NULL, -1);
    if (!words)
        return NULL;

    /* Capitalize each word */
    for (pos = 0; pos < PyList_GET_SIZE(words); pos++) {
        PyObject *capitalized =
            fixup((PyUnicodeObject *)PyList_GET_ITEM(words, pos),
                  fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* Release the old word before installing its replacement. */
        Py_DECREF(PyList_GET_ITEM(words, pos));
        PyList_SET_ITEM(words, pos, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return joined;
}
#endif
9476
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009477/* Argument converter. Coerces to a single unicode character */
9478
9479static int
9480convert_uc(PyObject *obj, void *addr)
9481{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009482 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009483 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009484
Benjamin Peterson14339b62009-01-31 16:36:08 +00009485 uniobj = PyUnicode_FromObject(obj);
9486 if (uniobj == NULL) {
9487 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009488 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009489 return 0;
9490 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009491 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009492 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009493 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009494 Py_DECREF(uniobj);
9495 return 0;
9496 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009497 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009498 Py_DECREF(uniobj);
9499 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009500}
9501
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009502PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009503 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009504\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009505Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009506done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009507
9508static PyObject *
9509unicode_center(PyUnicodeObject *self, PyObject *args)
9510{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009511 Py_ssize_t marg, left;
9512 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009513 Py_UCS4 fillchar = ' ';
9514
Victor Stinnere9a29352011-10-01 02:14:59 +02009515 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009516 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009517
Victor Stinnere9a29352011-10-01 02:14:59 +02009518 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009519 return NULL;
9520
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009521 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009522 Py_INCREF(self);
9523 return (PyObject*) self;
9524 }
9525
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009526 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009527 left = marg / 2 + (marg & width & 1);
9528
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009529 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009530}
9531
Marc-André Lemburge5034372000-08-08 08:04:29 +00009532#if 0
9533
9534/* This code should go into some future Unicode collation support
9535 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00009536 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00009537
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009538/* speedy UTF-16 code point order comparison */
9539/* gleaned from: */
9540/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
9541
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009542static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009543{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009544 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00009545 0, 0, 0, 0, 0, 0, 0, 0,
9546 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009547 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009548};
9549
Guido van Rossumd57fd912000-03-10 22:53:23 +00009550static int
9551unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9552{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009553 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009554
Guido van Rossumd57fd912000-03-10 22:53:23 +00009555 Py_UNICODE *s1 = str1->str;
9556 Py_UNICODE *s2 = str2->str;
9557
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009558 len1 = str1->_base._base.length;
9559 len2 = str2->_base._base.length;
Tim Petersced69f82003-09-16 20:30:58 +00009560
Guido van Rossumd57fd912000-03-10 22:53:23 +00009561 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00009562 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009563
9564 c1 = *s1++;
9565 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00009566
Benjamin Peterson29060642009-01-31 22:14:21 +00009567 if (c1 > (1<<11) * 26)
9568 c1 += utf16Fixup[c1>>11];
9569 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009570 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009571 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00009572
9573 if (c1 != c2)
9574 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00009575
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009576 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009577 }
9578
9579 return (len1 < len2) ? -1 : (len1 != len2);
9580}
9581
Marc-André Lemburge5034372000-08-08 08:04:29 +00009582#else
9583
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009584/* This function assumes that str1 and str2 are readied by the caller. */
9585
Marc-André Lemburge5034372000-08-08 08:04:29 +00009586static int
9587unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9588{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009589 int kind1, kind2;
9590 void *data1, *data2;
9591 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009592
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009593 kind1 = PyUnicode_KIND(str1);
9594 kind2 = PyUnicode_KIND(str2);
9595 data1 = PyUnicode_DATA(str1);
9596 data2 = PyUnicode_DATA(str2);
9597 len1 = PyUnicode_GET_LENGTH(str1);
9598 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +00009599
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009600 for (i = 0; i < len1 && i < len2; ++i) {
9601 Py_UCS4 c1, c2;
9602 c1 = PyUnicode_READ(kind1, data1, i);
9603 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +00009604
9605 if (c1 != c2)
9606 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009607 }
9608
9609 return (len1 < len2) ? -1 : (len1 != len2);
9610}
9611
9612#endif
9613
Alexander Belopolsky40018472011-02-26 01:02:56 +00009614int
9615PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009616{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009617 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9618 if (PyUnicode_READY(left) == -1 ||
9619 PyUnicode_READY(right) == -1)
9620 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009621 return unicode_compare((PyUnicodeObject *)left,
9622 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009623 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009624 PyErr_Format(PyExc_TypeError,
9625 "Can't compare %.100s and %.100s",
9626 left->ob_type->tp_name,
9627 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009628 return -1;
9629}
9630
Martin v. Löwis5b222132007-06-10 09:51:05 +00009631int
9632PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
9633{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009634 Py_ssize_t i;
9635 int kind;
9636 void *data;
9637 Py_UCS4 chr;
9638
Victor Stinner910337b2011-10-03 03:20:16 +02009639 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009640 if (PyUnicode_READY(uni) == -1)
9641 return -1;
9642 kind = PyUnicode_KIND(uni);
9643 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009644 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009645 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
9646 if (chr != str[i])
9647 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00009648 /* This check keeps Python strings that end in '\0' from comparing equal
9649 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009650 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +00009651 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009652 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00009653 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009654 return 0;
9655}
9656
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009657
Benjamin Peterson29060642009-01-31 22:14:21 +00009658#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00009659 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009660
Alexander Belopolsky40018472011-02-26 01:02:56 +00009661PyObject *
9662PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009663{
9664 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009665
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009666 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9667 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009668 if (PyUnicode_READY(left) == -1 ||
9669 PyUnicode_READY(right) == -1)
9670 return NULL;
9671 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
9672 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009673 if (op == Py_EQ) {
9674 Py_INCREF(Py_False);
9675 return Py_False;
9676 }
9677 if (op == Py_NE) {
9678 Py_INCREF(Py_True);
9679 return Py_True;
9680 }
9681 }
9682 if (left == right)
9683 result = 0;
9684 else
9685 result = unicode_compare((PyUnicodeObject *)left,
9686 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009687
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009688 /* Convert the return value to a Boolean */
9689 switch (op) {
9690 case Py_EQ:
9691 v = TEST_COND(result == 0);
9692 break;
9693 case Py_NE:
9694 v = TEST_COND(result != 0);
9695 break;
9696 case Py_LE:
9697 v = TEST_COND(result <= 0);
9698 break;
9699 case Py_GE:
9700 v = TEST_COND(result >= 0);
9701 break;
9702 case Py_LT:
9703 v = TEST_COND(result == -1);
9704 break;
9705 case Py_GT:
9706 v = TEST_COND(result == 1);
9707 break;
9708 default:
9709 PyErr_BadArgument();
9710 return NULL;
9711 }
9712 Py_INCREF(v);
9713 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009714 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009715
Brian Curtindfc80e32011-08-10 20:28:54 -05009716 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009717}
9718
Alexander Belopolsky40018472011-02-26 01:02:56 +00009719int
9720PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00009721{
Thomas Wouters477c8d52006-05-27 19:21:47 +00009722 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009723 int kind1, kind2, kind;
9724 void *buf1, *buf2;
9725 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009726 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009727
9728 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00009729 sub = PyUnicode_FromObject(element);
9730 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009731 PyErr_Format(PyExc_TypeError,
9732 "'in <string>' requires string as left operand, not %s",
9733 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009734 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009735 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009736 if (PyUnicode_READY(sub) == -1)
9737 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009738
Thomas Wouters477c8d52006-05-27 19:21:47 +00009739 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +02009740 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009741 Py_DECREF(sub);
9742 return -1;
9743 }
9744
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009745 kind1 = PyUnicode_KIND(str);
9746 kind2 = PyUnicode_KIND(sub);
9747 kind = kind1 > kind2 ? kind1 : kind2;
9748 buf1 = PyUnicode_DATA(str);
9749 buf2 = PyUnicode_DATA(sub);
9750 if (kind1 != kind)
9751 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
9752 if (!buf1) {
9753 Py_DECREF(sub);
9754 return -1;
9755 }
9756 if (kind2 != kind)
9757 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
9758 if (!buf2) {
9759 Py_DECREF(sub);
9760 if (kind1 != kind) PyMem_Free(buf1);
9761 return -1;
9762 }
9763 len1 = PyUnicode_GET_LENGTH(str);
9764 len2 = PyUnicode_GET_LENGTH(sub);
9765
9766 switch(kind) {
9767 case PyUnicode_1BYTE_KIND:
9768 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
9769 break;
9770 case PyUnicode_2BYTE_KIND:
9771 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
9772 break;
9773 case PyUnicode_4BYTE_KIND:
9774 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
9775 break;
9776 default:
9777 result = -1;
9778 assert(0);
9779 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009780
9781 Py_DECREF(str);
9782 Py_DECREF(sub);
9783
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009784 if (kind1 != kind)
9785 PyMem_Free(buf1);
9786 if (kind2 != kind)
9787 PyMem_Free(buf2);
9788
Guido van Rossum403d68b2000-03-13 15:55:09 +00009789 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009790}
9791
Guido van Rossumd57fd912000-03-10 22:53:23 +00009792/* Concat to string or Unicode object giving a new Unicode object. */
9793
Alexander Belopolsky40018472011-02-26 01:02:56 +00009794PyObject *
9795PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009796{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009797 PyObject *u = NULL, *v = NULL, *w;
9798 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009799
9800 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009801 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009802 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009803 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009804 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009805 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009806 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009807
9808 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +02009809 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009810 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009811 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009812 }
Victor Stinnera464fc12011-10-02 20:39:30 +02009813 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009814 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009815 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009816 }
9817
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009818 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinnerff9e50f2011-09-28 22:17:19 +02009819 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009820
Guido van Rossumd57fd912000-03-10 22:53:23 +00009821 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009822 w = PyUnicode_New(
9823 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
9824 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009825 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009826 goto onError;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009827 if (PyUnicode_CopyCharacters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)) < 0)
9828 goto onError;
Victor Stinner157f83f2011-09-28 21:41:31 +02009829 if (PyUnicode_CopyCharacters(w, PyUnicode_GET_LENGTH(u),
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009830 v, 0,
9831 PyUnicode_GET_LENGTH(v)) < 0)
9832 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009833 Py_DECREF(u);
9834 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009835 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009836
Benjamin Peterson29060642009-01-31 22:14:21 +00009837 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009838 Py_XDECREF(u);
9839 Py_XDECREF(v);
9840 return NULL;
9841}
9842
Walter Dörwald1ab83302007-05-18 17:15:44 +00009843void
Victor Stinner23e56682011-10-03 03:54:37 +02009844PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +00009845{
Victor Stinner23e56682011-10-03 03:54:37 +02009846 PyObject *left, *res;
9847
9848 if (p_left == NULL) {
9849 if (!PyErr_Occurred())
9850 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +00009851 return;
9852 }
Victor Stinner23e56682011-10-03 03:54:37 +02009853 left = *p_left;
9854 if (right == NULL || !PyUnicode_Check(left)) {
9855 if (!PyErr_Occurred())
9856 PyErr_BadInternalCall();
9857 goto error;
9858 }
9859
9860 if (PyUnicode_CheckExact(left) && left != unicode_empty
9861 && PyUnicode_CheckExact(right) && right != unicode_empty
9862 && unicode_resizable(left)
9863 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
9864 || _PyUnicode_WSTR(left) != NULL))
9865 {
9866 Py_ssize_t u_len, v_len, new_len, copied;
9867
9868 /* FIXME: don't make wstr string ready */
9869 if (PyUnicode_READY(left))
9870 goto error;
9871 if (PyUnicode_READY(right))
9872 goto error;
9873
9874 /* FIXME: support ascii+latin1, PyASCIIObject => PyCompactUnicodeObject */
9875 if (PyUnicode_MAX_CHAR_VALUE(right) <= PyUnicode_MAX_CHAR_VALUE(left))
9876 {
9877 u_len = PyUnicode_GET_LENGTH(left);
9878 v_len = PyUnicode_GET_LENGTH(right);
9879 if (u_len > PY_SSIZE_T_MAX - v_len) {
9880 PyErr_SetString(PyExc_OverflowError,
9881 "strings are too large to concat");
9882 goto error;
9883 }
9884 new_len = u_len + v_len;
9885
9886 /* Now we own the last reference to 'left', so we can resize it
9887 * in-place.
9888 */
9889 if (unicode_resize(&left, new_len) != 0) {
9890 /* XXX if _PyUnicode_Resize() fails, 'left' has been
9891 * deallocated so it cannot be put back into
9892 * 'variable'. The MemoryError is raised when there
9893 * is no value in 'variable', which might (very
9894 * remotely) be a cause of incompatibilities.
9895 */
9896 goto error;
9897 }
9898 /* copy 'right' into the newly allocated area of 'left' */
9899 copied = PyUnicode_CopyCharacters(left, u_len,
9900 right, 0,
9901 v_len);
9902 assert(0 <= copied);
9903 *p_left = left;
9904 return;
9905 }
9906 }
9907
9908 res = PyUnicode_Concat(left, right);
9909 if (res == NULL)
9910 goto error;
9911 Py_DECREF(left);
9912 *p_left = res;
9913 return;
9914
9915error:
9916 Py_DECREF(*p_left);
9917 *p_left = NULL;
Walter Dörwald1ab83302007-05-18 17:15:44 +00009918}
9919
9920void
9921PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
9922{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009923 PyUnicode_Append(pleft, right);
9924 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00009925}
9926
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009927PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009928 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009929\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00009930Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009931string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009932interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009933
9934static PyObject *
9935unicode_count(PyUnicodeObject *self, PyObject *args)
9936{
9937 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009938 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009939 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009940 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009941 int kind1, kind2, kind;
9942 void *buf1, *buf2;
9943 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009944
Jesus Ceaac451502011-04-20 17:09:23 +02009945 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
9946 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +00009947 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00009948
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009949 kind1 = PyUnicode_KIND(self);
9950 kind2 = PyUnicode_KIND(substring);
9951 kind = kind1 > kind2 ? kind1 : kind2;
9952 buf1 = PyUnicode_DATA(self);
9953 buf2 = PyUnicode_DATA(substring);
9954 if (kind1 != kind)
9955 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9956 if (!buf1) {
9957 Py_DECREF(substring);
9958 return NULL;
9959 }
9960 if (kind2 != kind)
9961 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9962 if (!buf2) {
9963 Py_DECREF(substring);
9964 if (kind1 != kind) PyMem_Free(buf1);
9965 return NULL;
9966 }
9967 len1 = PyUnicode_GET_LENGTH(self);
9968 len2 = PyUnicode_GET_LENGTH(substring);
9969
9970 ADJUST_INDICES(start, end, len1);
9971 switch(kind) {
9972 case PyUnicode_1BYTE_KIND:
9973 iresult = ucs1lib_count(
9974 ((Py_UCS1*)buf1) + start, end - start,
9975 buf2, len2, PY_SSIZE_T_MAX
9976 );
9977 break;
9978 case PyUnicode_2BYTE_KIND:
9979 iresult = ucs2lib_count(
9980 ((Py_UCS2*)buf1) + start, end - start,
9981 buf2, len2, PY_SSIZE_T_MAX
9982 );
9983 break;
9984 case PyUnicode_4BYTE_KIND:
9985 iresult = ucs4lib_count(
9986 ((Py_UCS4*)buf1) + start, end - start,
9987 buf2, len2, PY_SSIZE_T_MAX
9988 );
9989 break;
9990 default:
9991 assert(0); iresult = 0;
9992 }
9993
9994 result = PyLong_FromSsize_t(iresult);
9995
9996 if (kind1 != kind)
9997 PyMem_Free(buf1);
9998 if (kind2 != kind)
9999 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010000
10001 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010002
Guido van Rossumd57fd912000-03-10 22:53:23 +000010003 return result;
10004}
10005
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010006PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010007 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010008\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010009Encode S using the codec registered for encoding. Default encoding\n\
10010is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010011handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010012a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10013'xmlcharrefreplace' as well as any other name registered with\n\
10014codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010015
10016static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +000010017unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010018{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010019 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010020 char *encoding = NULL;
10021 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010022
Benjamin Peterson308d6372009-09-18 21:42:35 +000010023 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10024 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010025 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +000010026 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010027}
10028
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010029PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010030 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010031\n\
10032Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010033If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010034
10035static PyObject*
10036unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
10037{
10038 Py_UNICODE *e;
10039 Py_UNICODE *p;
10040 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010041 Py_UNICODE *qe;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010042 Py_ssize_t i, j, incr, wstr_length;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010043 PyUnicodeObject *u;
10044 int tabsize = 8;
10045
10046 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010047 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010048
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010049 if (PyUnicode_AsUnicodeAndSize((PyObject *)self, &wstr_length) == NULL)
10050 return NULL;
10051
Thomas Wouters7e474022000-07-16 12:04:32 +000010052 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010053 i = 0; /* chars up to and including most recent \n or \r */
10054 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010055 e = _PyUnicode_WSTR(self) + wstr_length; /* end of input */
10056 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010057 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010058 if (tabsize > 0) {
10059 incr = tabsize - (j % tabsize); /* cannot overflow */
10060 if (j > PY_SSIZE_T_MAX - incr)
10061 goto overflow1;
10062 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010063 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010064 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010065 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010066 if (j > PY_SSIZE_T_MAX - 1)
10067 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010068 j++;
10069 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010070 if (i > PY_SSIZE_T_MAX - j)
10071 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010072 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010073 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010074 }
10075 }
10076
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010077 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +000010078 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010079
Guido van Rossumd57fd912000-03-10 22:53:23 +000010080 /* Second pass: create output string and fill it */
10081 u = _PyUnicode_New(i + j);
10082 if (!u)
10083 return NULL;
10084
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010085 j = 0; /* same as in first pass */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010086 q = _PyUnicode_WSTR(u); /* next output char */
10087 qe = _PyUnicode_WSTR(u) + PyUnicode_GET_SIZE(u); /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +000010088
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010089 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010090 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010091 if (tabsize > 0) {
10092 i = tabsize - (j % tabsize);
10093 j += i;
10094 while (i--) {
10095 if (q >= qe)
10096 goto overflow2;
10097 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010098 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010099 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010100 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010101 else {
10102 if (q >= qe)
10103 goto overflow2;
10104 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010105 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010106 if (*p == '\n' || *p == '\r')
10107 j = 0;
10108 }
10109
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010110 if (PyUnicode_READY(u) == -1) {
10111 Py_DECREF(u);
10112 return NULL;
10113 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010114 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010115
10116 overflow2:
10117 Py_DECREF(u);
10118 overflow1:
10119 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10120 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010121}
10122
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010123PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010124 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010125\n\
10126Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010127such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010128arguments start and end are interpreted as in slice notation.\n\
10129\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010130Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010131
10132static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010133unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010134{
Jesus Ceaac451502011-04-20 17:09:23 +020010135 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010136 Py_ssize_t start;
10137 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010138 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010139
Jesus Ceaac451502011-04-20 17:09:23 +020010140 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10141 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010142 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010143
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010144 if (PyUnicode_READY(self) == -1)
10145 return NULL;
10146 if (PyUnicode_READY(substring) == -1)
10147 return NULL;
10148
10149 result = any_find_slice(
10150 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
10151 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010152 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010153
10154 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010155
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010156 if (result == -2)
10157 return NULL;
10158
Christian Heimes217cfd12007-12-02 14:31:20 +000010159 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010160}
10161
10162static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010163unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010164{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010165 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
10166 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010167 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010168 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010169}
10170
Guido van Rossumc2504932007-09-18 19:42:40 +000010171/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010010172 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000010173static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +000010174unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010175{
Guido van Rossumc2504932007-09-18 19:42:40 +000010176 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010010177 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010178
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010179 if (_PyUnicode_HASH(self) != -1)
10180 return _PyUnicode_HASH(self);
10181 if (PyUnicode_READY(self) == -1)
10182 return -1;
10183 len = PyUnicode_GET_LENGTH(self);
10184
10185 /* The hash function as a macro, gets expanded three times below. */
10186#define HASH(P) \
10187 x = (Py_uhash_t)*P << 7; \
10188 while (--len >= 0) \
10189 x = (1000003*x) ^ (Py_uhash_t)*P++;
10190
10191 switch (PyUnicode_KIND(self)) {
10192 case PyUnicode_1BYTE_KIND: {
10193 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
10194 HASH(c);
10195 break;
10196 }
10197 case PyUnicode_2BYTE_KIND: {
10198 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
10199 HASH(s);
10200 break;
10201 }
10202 default: {
10203 Py_UCS4 *l;
10204 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
10205 "Impossible switch case in unicode_hash");
10206 l = PyUnicode_4BYTE_DATA(self);
10207 HASH(l);
10208 break;
10209 }
10210 }
10211 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
10212
Guido van Rossumc2504932007-09-18 19:42:40 +000010213 if (x == -1)
10214 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010215 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010216 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010217}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010218#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000010219
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010220PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010221 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010222\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010223Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010224
10225static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010226unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010227{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010228 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +020010229 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010230 Py_ssize_t start;
10231 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010232
Jesus Ceaac451502011-04-20 17:09:23 +020010233 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
10234 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010235 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010236
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010237 if (PyUnicode_READY(self) == -1)
10238 return NULL;
10239 if (PyUnicode_READY(substring) == -1)
10240 return NULL;
10241
10242 result = any_find_slice(
10243 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
10244 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010245 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010246
10247 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010248
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010249 if (result == -2)
10250 return NULL;
10251
Guido van Rossumd57fd912000-03-10 22:53:23 +000010252 if (result < 0) {
10253 PyErr_SetString(PyExc_ValueError, "substring not found");
10254 return NULL;
10255 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010256
Christian Heimes217cfd12007-12-02 14:31:20 +000010257 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010258}
10259
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010260PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010261 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010262\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010263Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010264at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010265
10266static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010267unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010268{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010269 Py_ssize_t i, length;
10270 int kind;
10271 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010272 int cased;
10273
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010274 if (PyUnicode_READY(self) == -1)
10275 return NULL;
10276 length = PyUnicode_GET_LENGTH(self);
10277 kind = PyUnicode_KIND(self);
10278 data = PyUnicode_DATA(self);
10279
Guido van Rossumd57fd912000-03-10 22:53:23 +000010280 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010281 if (length == 1)
10282 return PyBool_FromLong(
10283 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010284
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010285 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010286 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010287 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010288
Guido van Rossumd57fd912000-03-10 22:53:23 +000010289 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010290 for (i = 0; i < length; i++) {
10291 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010292
Benjamin Peterson29060642009-01-31 22:14:21 +000010293 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
10294 return PyBool_FromLong(0);
10295 else if (!cased && Py_UNICODE_ISLOWER(ch))
10296 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010297 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010298 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010299}
10300
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010301PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010302 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010303\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010304Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010305at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010306
10307static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010308unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010309{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010310 Py_ssize_t i, length;
10311 int kind;
10312 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010313 int cased;
10314
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010315 if (PyUnicode_READY(self) == -1)
10316 return NULL;
10317 length = PyUnicode_GET_LENGTH(self);
10318 kind = PyUnicode_KIND(self);
10319 data = PyUnicode_DATA(self);
10320
Guido van Rossumd57fd912000-03-10 22:53:23 +000010321 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010322 if (length == 1)
10323 return PyBool_FromLong(
10324 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010325
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010326 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010327 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010328 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010329
Guido van Rossumd57fd912000-03-10 22:53:23 +000010330 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010331 for (i = 0; i < length; i++) {
10332 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010333
Benjamin Peterson29060642009-01-31 22:14:21 +000010334 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
10335 return PyBool_FromLong(0);
10336 else if (!cased && Py_UNICODE_ISUPPER(ch))
10337 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010338 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010339 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010340}
10341
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010342PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010343 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010344\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010345Return True if S is a titlecased string and there is at least one\n\
10346character in S, i.e. upper- and titlecase characters may only\n\
10347follow uncased characters and lowercase characters only cased ones.\n\
10348Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010349
10350static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010351unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010352{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010353 Py_ssize_t i, length;
10354 int kind;
10355 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010356 int cased, previous_is_cased;
10357
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010358 if (PyUnicode_READY(self) == -1)
10359 return NULL;
10360 length = PyUnicode_GET_LENGTH(self);
10361 kind = PyUnicode_KIND(self);
10362 data = PyUnicode_DATA(self);
10363
Guido van Rossumd57fd912000-03-10 22:53:23 +000010364 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010365 if (length == 1) {
10366 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10367 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
10368 (Py_UNICODE_ISUPPER(ch) != 0));
10369 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010370
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010371 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010372 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010373 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010374
Guido van Rossumd57fd912000-03-10 22:53:23 +000010375 cased = 0;
10376 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010377 for (i = 0; i < length; i++) {
10378 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010379
Benjamin Peterson29060642009-01-31 22:14:21 +000010380 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
10381 if (previous_is_cased)
10382 return PyBool_FromLong(0);
10383 previous_is_cased = 1;
10384 cased = 1;
10385 }
10386 else if (Py_UNICODE_ISLOWER(ch)) {
10387 if (!previous_is_cased)
10388 return PyBool_FromLong(0);
10389 previous_is_cased = 1;
10390 cased = 1;
10391 }
10392 else
10393 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010394 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010395 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010396}
10397
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010398PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010399 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010400\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010401Return True if all characters in S are whitespace\n\
10402and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010403
10404static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010405unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010406{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010407 Py_ssize_t i, length;
10408 int kind;
10409 void *data;
10410
10411 if (PyUnicode_READY(self) == -1)
10412 return NULL;
10413 length = PyUnicode_GET_LENGTH(self);
10414 kind = PyUnicode_KIND(self);
10415 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010416
Guido van Rossumd57fd912000-03-10 22:53:23 +000010417 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010418 if (length == 1)
10419 return PyBool_FromLong(
10420 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010421
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010422 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010423 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010424 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010425
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010426 for (i = 0; i < length; i++) {
10427 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010428 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010429 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010430 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010431 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010432}
10433
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010434PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010435 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010436\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010437Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010438and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010439
10440static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010441unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010442{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010443 Py_ssize_t i, length;
10444 int kind;
10445 void *data;
10446
10447 if (PyUnicode_READY(self) == -1)
10448 return NULL;
10449 length = PyUnicode_GET_LENGTH(self);
10450 kind = PyUnicode_KIND(self);
10451 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010452
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010453 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010454 if (length == 1)
10455 return PyBool_FromLong(
10456 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010457
10458 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010459 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010460 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010461
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010462 for (i = 0; i < length; i++) {
10463 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010464 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010465 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010466 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010467}
10468
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010469PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010470 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010471\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010472Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010473and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010474
10475static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010476unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010477{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010478 int kind;
10479 void *data;
10480 Py_ssize_t len, i;
10481
10482 if (PyUnicode_READY(self) == -1)
10483 return NULL;
10484
10485 kind = PyUnicode_KIND(self);
10486 data = PyUnicode_DATA(self);
10487 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010488
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010489 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010490 if (len == 1) {
10491 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10492 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10493 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010494
10495 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010496 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010497 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010498
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010499 for (i = 0; i < len; i++) {
10500 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010501 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010502 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010503 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010504 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010505}
10506
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010507PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010508 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010509\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010510Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010511False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010512
10513static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010514unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010515{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010516 Py_ssize_t i, length;
10517 int kind;
10518 void *data;
10519
10520 if (PyUnicode_READY(self) == -1)
10521 return NULL;
10522 length = PyUnicode_GET_LENGTH(self);
10523 kind = PyUnicode_KIND(self);
10524 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010525
Guido van Rossumd57fd912000-03-10 22:53:23 +000010526 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010527 if (length == 1)
10528 return PyBool_FromLong(
10529 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010530
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010531 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010532 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010533 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010534
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010535 for (i = 0; i < length; i++) {
10536 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010537 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010538 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010539 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010540}
10541
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010542PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010543 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010544\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010545Return True if all characters in S are digits\n\
10546and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010547
10548static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010549unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010550{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010551 Py_ssize_t i, length;
10552 int kind;
10553 void *data;
10554
10555 if (PyUnicode_READY(self) == -1)
10556 return NULL;
10557 length = PyUnicode_GET_LENGTH(self);
10558 kind = PyUnicode_KIND(self);
10559 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010560
Guido van Rossumd57fd912000-03-10 22:53:23 +000010561 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010562 if (length == 1) {
10563 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10564 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
10565 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010566
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010567 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010568 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010569 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010570
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010571 for (i = 0; i < length; i++) {
10572 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010573 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010574 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010575 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010576}
10577
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010578PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010579 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010580\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010581Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010582False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010583
10584static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010585unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010586{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010587 Py_ssize_t i, length;
10588 int kind;
10589 void *data;
10590
10591 if (PyUnicode_READY(self) == -1)
10592 return NULL;
10593 length = PyUnicode_GET_LENGTH(self);
10594 kind = PyUnicode_KIND(self);
10595 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010596
Guido van Rossumd57fd912000-03-10 22:53:23 +000010597 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010598 if (length == 1)
10599 return PyBool_FromLong(
10600 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010601
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010602 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010603 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010604 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010605
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010606 for (i = 0; i < length; i++) {
10607 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010608 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010609 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010610 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010611}
10612
Martin v. Löwis47383402007-08-15 07:32:56 +000010613int
10614PyUnicode_IsIdentifier(PyObject *self)
10615{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010616 int kind;
10617 void *data;
10618 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010619 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000010620
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010621 if (PyUnicode_READY(self) == -1) {
10622 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000010623 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010624 }
10625
10626 /* Special case for empty strings */
10627 if (PyUnicode_GET_LENGTH(self) == 0)
10628 return 0;
10629 kind = PyUnicode_KIND(self);
10630 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000010631
10632 /* PEP 3131 says that the first character must be in
10633 XID_Start and subsequent characters in XID_Continue,
10634 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000010635 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000010636 letters, digits, underscore). However, given the current
10637 definition of XID_Start and XID_Continue, it is sufficient
10638 to check just for these, except that _ must be allowed
10639 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010640 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050010641 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000010642 return 0;
10643
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040010644 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010645 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010646 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000010647 return 1;
10648}
10649
10650PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010651 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000010652\n\
10653Return True if S is a valid identifier according\n\
10654to the language definition.");
10655
10656static PyObject*
10657unicode_isidentifier(PyObject *self)
10658{
10659 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
10660}
10661
Georg Brandl559e5d72008-06-11 18:37:52 +000010662PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010663 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000010664\n\
10665Return True if all characters in S are considered\n\
10666printable in repr() or S is empty, False otherwise.");
10667
10668static PyObject*
10669unicode_isprintable(PyObject *self)
10670{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010671 Py_ssize_t i, length;
10672 int kind;
10673 void *data;
10674
10675 if (PyUnicode_READY(self) == -1)
10676 return NULL;
10677 length = PyUnicode_GET_LENGTH(self);
10678 kind = PyUnicode_KIND(self);
10679 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000010680
10681 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010682 if (length == 1)
10683 return PyBool_FromLong(
10684 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000010685
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010686 for (i = 0; i < length; i++) {
10687 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010688 Py_RETURN_FALSE;
10689 }
10690 }
10691 Py_RETURN_TRUE;
10692}
10693
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010694PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000010695 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010696\n\
10697Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000010698iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010699
10700static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010701unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010702{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010703 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010704}
10705
Martin v. Löwis18e16552006-02-15 17:27:45 +000010706static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000010707unicode_length(PyUnicodeObject *self)
10708{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010709 if (PyUnicode_READY(self) == -1)
10710 return -1;
10711 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010712}
10713
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010714PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010715 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010716\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010717Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010718done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010719
10720static PyObject *
10721unicode_ljust(PyUnicodeObject *self, PyObject *args)
10722{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010723 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010724 Py_UCS4 fillchar = ' ';
10725
10726 if (PyUnicode_READY(self) == -1)
10727 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010728
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010729 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010730 return NULL;
10731
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010732 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010733 Py_INCREF(self);
10734 return (PyObject*) self;
10735 }
10736
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010737 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010738}
10739
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010740PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010741 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010742\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010743Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010744
10745static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010746unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010747{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010748 return fixup(self, fixlower);
10749}
10750
/* Strip directions; they double as indexes into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() formats, indexed by strip direction. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for direction i: its format string past the 3-character
   "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i]+3)
10759
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010760/* externally visible for str.strip(unicode) */
10761PyObject *
10762_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
10763{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010764 void *data;
10765 int kind;
10766 Py_ssize_t i, j, len;
10767 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010768
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010769 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
10770 return NULL;
10771
10772 kind = PyUnicode_KIND(self);
10773 data = PyUnicode_DATA(self);
10774 len = PyUnicode_GET_LENGTH(self);
10775 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
10776 PyUnicode_DATA(sepobj),
10777 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010778
Benjamin Peterson14339b62009-01-31 16:36:08 +000010779 i = 0;
10780 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010781 while (i < len &&
10782 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010783 i++;
10784 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010785 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010786
Benjamin Peterson14339b62009-01-31 16:36:08 +000010787 j = len;
10788 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010789 do {
10790 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010791 } while (j >= i &&
10792 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000010793 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010794 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010795
Victor Stinner12bab6d2011-10-01 01:53:49 +020010796 return PyUnicode_Substring((PyObject*)self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010797}
10798
10799PyObject*
10800PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
10801{
10802 unsigned char *data;
10803 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020010804 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010805
Victor Stinnerde636f32011-10-01 03:55:54 +020010806 if (PyUnicode_READY(self) == -1)
10807 return NULL;
10808
10809 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
10810
Victor Stinner12bab6d2011-10-01 01:53:49 +020010811 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010812 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020010813 if (PyUnicode_CheckExact(self)) {
10814 Py_INCREF(self);
10815 return self;
10816 }
10817 else
10818 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010819 }
10820
Victor Stinner12bab6d2011-10-01 01:53:49 +020010821 length = end - start;
10822 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010823 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010824
Victor Stinnerde636f32011-10-01 03:55:54 +020010825 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020010826 PyErr_SetString(PyExc_IndexError, "string index out of range");
10827 return NULL;
10828 }
10829
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010830 kind = PyUnicode_KIND(self);
10831 data = PyUnicode_1BYTE_DATA(self);
Victor Stinner034f6cf2011-09-30 02:26:44 +020010832 return PyUnicode_FromKindAndData(kind,
10833 data + PyUnicode_KIND_SIZE(kind, start),
Victor Stinner12bab6d2011-10-01 01:53:49 +020010834 length);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010835}
Guido van Rossumd57fd912000-03-10 22:53:23 +000010836
10837static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010838do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010839{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010840 int kind;
10841 void *data;
10842 Py_ssize_t len, i, j;
10843
10844 if (PyUnicode_READY(self) == -1)
10845 return NULL;
10846
10847 kind = PyUnicode_KIND(self);
10848 data = PyUnicode_DATA(self);
10849 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010850
Benjamin Peterson14339b62009-01-31 16:36:08 +000010851 i = 0;
10852 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010853 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010854 i++;
10855 }
10856 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010857
Benjamin Peterson14339b62009-01-31 16:36:08 +000010858 j = len;
10859 if (striptype != LEFTSTRIP) {
10860 do {
10861 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010862 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010863 j++;
10864 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010865
Victor Stinner12bab6d2011-10-01 01:53:49 +020010866 return PyUnicode_Substring((PyObject*)self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010867}
10868
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010869
10870static PyObject *
10871do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
10872{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010873 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010874
Benjamin Peterson14339b62009-01-31 16:36:08 +000010875 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
10876 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010877
Benjamin Peterson14339b62009-01-31 16:36:08 +000010878 if (sep != NULL && sep != Py_None) {
10879 if (PyUnicode_Check(sep))
10880 return _PyUnicode_XStrip(self, striptype, sep);
10881 else {
10882 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010883 "%s arg must be None or str",
10884 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010885 return NULL;
10886 }
10887 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010888
Benjamin Peterson14339b62009-01-31 16:36:08 +000010889 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010890}
10891
10892
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010893PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010894 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010895\n\
10896Return a copy of the string S with leading and trailing\n\
10897whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010898If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010899
10900static PyObject *
10901unicode_strip(PyUnicodeObject *self, PyObject *args)
10902{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010903 if (PyTuple_GET_SIZE(args) == 0)
10904 return do_strip(self, BOTHSTRIP); /* Common case */
10905 else
10906 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010907}
10908
10909
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010910PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010911 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010912\n\
10913Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010914If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010915
10916static PyObject *
10917unicode_lstrip(PyUnicodeObject *self, PyObject *args)
10918{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010919 if (PyTuple_GET_SIZE(args) == 0)
10920 return do_strip(self, LEFTSTRIP); /* Common case */
10921 else
10922 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010923}
10924
10925
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010926PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010927 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010928\n\
10929Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010930If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010931
10932static PyObject *
10933unicode_rstrip(PyUnicodeObject *self, PyObject *args)
10934{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010935 if (PyTuple_GET_SIZE(args) == 0)
10936 return do_strip(self, RIGHTSTRIP); /* Common case */
10937 else
10938 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010939}
10940
10941
Guido van Rossumd57fd912000-03-10 22:53:23 +000010942static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000010943unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010944{
10945 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010946 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010947
Georg Brandl222de0f2009-04-12 12:01:50 +000010948 if (len < 1) {
10949 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020010950 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000010951 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010952
Tim Peters7a29bd52001-09-12 03:03:31 +000010953 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010954 /* no repeat, return original string */
10955 Py_INCREF(str);
10956 return (PyObject*) str;
10957 }
Tim Peters8f422462000-09-09 06:13:41 +000010958
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010959 if (PyUnicode_READY(str) == -1)
10960 return NULL;
10961
Victor Stinnerc759f3e2011-10-01 03:09:58 +020010962 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020010963 PyErr_SetString(PyExc_OverflowError,
10964 "repeated string is too long");
10965 return NULL;
10966 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010967 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020010968
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010969 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010970 if (!u)
10971 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020010972 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010973
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010974 if (PyUnicode_GET_LENGTH(str) == 1) {
10975 const int kind = PyUnicode_KIND(str);
10976 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
10977 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020010978 if (kind == PyUnicode_1BYTE_KIND)
10979 memset(to, (unsigned char)fill_char, len);
10980 else {
10981 for (n = 0; n < len; ++n)
10982 PyUnicode_WRITE(kind, to, n, fill_char);
10983 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010984 }
10985 else {
10986 /* number of characters copied this far */
10987 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
10988 const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str);
10989 char *to = (char *) PyUnicode_DATA(u);
10990 Py_MEMCPY(to, PyUnicode_DATA(str),
10991 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000010992 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010993 n = (done <= nchars-done) ? done : nchars-done;
10994 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010995 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000010996 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010997 }
10998
10999 return (PyObject*) u;
11000}
11001
Alexander Belopolsky40018472011-02-26 01:02:56 +000011002PyObject *
11003PyUnicode_Replace(PyObject *obj,
11004 PyObject *subobj,
11005 PyObject *replobj,
11006 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011007{
11008 PyObject *self;
11009 PyObject *str1;
11010 PyObject *str2;
11011 PyObject *result;
11012
11013 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011014 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011015 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011016 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011017 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011018 Py_DECREF(self);
11019 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011020 }
11021 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011022 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011023 Py_DECREF(self);
11024 Py_DECREF(str1);
11025 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011026 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011027 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011028 Py_DECREF(self);
11029 Py_DECREF(str1);
11030 Py_DECREF(str2);
11031 return result;
11032}
11033
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011034PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011035 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011036\n\
11037Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011038old replaced by new. If the optional argument count is\n\
11039given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011040
11041static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011042unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011043{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011044 PyObject *str1;
11045 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011046 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011047 PyObject *result;
11048
Martin v. Löwis18e16552006-02-15 17:27:45 +000011049 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011050 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011051 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011052 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011053 str1 = PyUnicode_FromObject(str1);
11054 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11055 return NULL;
11056 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011057 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011058 Py_DECREF(str1);
11059 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011060 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011061
11062 result = replace(self, str1, str2, maxcount);
11063
11064 Py_DECREF(str1);
11065 Py_DECREF(str2);
11066 return result;
11067}
11068
Alexander Belopolsky40018472011-02-26 01:02:56 +000011069static PyObject *
11070unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011071{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011072 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011073 Py_ssize_t isize;
11074 Py_ssize_t osize, squote, dquote, i, o;
11075 Py_UCS4 max, quote;
11076 int ikind, okind;
11077 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011078
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011079 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011080 return NULL;
11081
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011082 isize = PyUnicode_GET_LENGTH(unicode);
11083 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011084
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011085 /* Compute length of output, quote characters, and
11086 maximum character */
11087 osize = 2; /* quotes */
11088 max = 127;
11089 squote = dquote = 0;
11090 ikind = PyUnicode_KIND(unicode);
11091 for (i = 0; i < isize; i++) {
11092 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11093 switch (ch) {
11094 case '\'': squote++; osize++; break;
11095 case '"': dquote++; osize++; break;
11096 case '\\': case '\t': case '\r': case '\n':
11097 osize += 2; break;
11098 default:
11099 /* Fast-path ASCII */
11100 if (ch < ' ' || ch == 0x7f)
11101 osize += 4; /* \xHH */
11102 else if (ch < 0x7f)
11103 osize++;
11104 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11105 osize++;
11106 max = ch > max ? ch : max;
11107 }
11108 else if (ch < 0x100)
11109 osize += 4; /* \xHH */
11110 else if (ch < 0x10000)
11111 osize += 6; /* \uHHHH */
11112 else
11113 osize += 10; /* \uHHHHHHHH */
11114 }
11115 }
11116
11117 quote = '\'';
11118 if (squote) {
11119 if (dquote)
11120 /* Both squote and dquote present. Use squote,
11121 and escape them */
11122 osize += squote;
11123 else
11124 quote = '"';
11125 }
11126
11127 repr = PyUnicode_New(osize, max);
11128 if (repr == NULL)
11129 return NULL;
11130 okind = PyUnicode_KIND(repr);
11131 odata = PyUnicode_DATA(repr);
11132
11133 PyUnicode_WRITE(okind, odata, 0, quote);
11134 PyUnicode_WRITE(okind, odata, osize-1, quote);
11135
11136 for (i = 0, o = 1; i < isize; i++) {
11137 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011138
11139 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011140 if ((ch == quote) || (ch == '\\')) {
11141 PyUnicode_WRITE(okind, odata, o++, '\\');
11142 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011143 continue;
11144 }
11145
Benjamin Peterson29060642009-01-31 22:14:21 +000011146 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011147 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011148 PyUnicode_WRITE(okind, odata, o++, '\\');
11149 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011150 }
11151 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011152 PyUnicode_WRITE(okind, odata, o++, '\\');
11153 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011154 }
11155 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011156 PyUnicode_WRITE(okind, odata, o++, '\\');
11157 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011158 }
11159
11160 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011161 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011162 PyUnicode_WRITE(okind, odata, o++, '\\');
11163 PyUnicode_WRITE(okind, odata, o++, 'x');
11164 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11165 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011166 }
11167
Georg Brandl559e5d72008-06-11 18:37:52 +000011168 /* Copy ASCII characters as-is */
11169 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011170 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011171 }
11172
Benjamin Peterson29060642009-01-31 22:14:21 +000011173 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000011174 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011175 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000011176 (categories Z* and C* except ASCII space)
11177 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011178 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011179 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011180 if (ch <= 0xff) {
11181 PyUnicode_WRITE(okind, odata, o++, '\\');
11182 PyUnicode_WRITE(okind, odata, o++, 'x');
11183 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11184 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011185 }
11186 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011187 else if (ch >= 0x10000) {
11188 PyUnicode_WRITE(okind, odata, o++, '\\');
11189 PyUnicode_WRITE(okind, odata, o++, 'U');
11190 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
11191 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
11192 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
11193 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
11194 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11195 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11196 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11197 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011198 }
11199 /* Map 16-bit characters to '\uxxxx' */
11200 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011201 PyUnicode_WRITE(okind, odata, o++, '\\');
11202 PyUnicode_WRITE(okind, odata, o++, 'u');
11203 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11204 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11205 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11206 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011207 }
11208 }
11209 /* Copy characters as-is */
11210 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011211 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011212 }
11213 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000011214 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011215 /* Closing quote already added at the beginning */
Walter Dörwald79e913e2007-05-12 11:08:06 +000011216 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011217}
11218
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011219PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011220 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011221\n\
11222Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011223such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011224arguments start and end are interpreted as in slice notation.\n\
11225\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011226Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011227
11228static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011229unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011230{
Jesus Ceaac451502011-04-20 17:09:23 +020011231 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011232 Py_ssize_t start;
11233 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011234 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011235
Jesus Ceaac451502011-04-20 17:09:23 +020011236 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
11237 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011238 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011239
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011240 if (PyUnicode_READY(self) == -1)
11241 return NULL;
11242 if (PyUnicode_READY(substring) == -1)
11243 return NULL;
11244
11245 result = any_find_slice(
11246 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11247 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011248 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011249
11250 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011251
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011252 if (result == -2)
11253 return NULL;
11254
Christian Heimes217cfd12007-12-02 14:31:20 +000011255 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011256}
11257
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011258PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011259 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011260\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011261Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011262
11263static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011264unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011265{
Jesus Ceaac451502011-04-20 17:09:23 +020011266 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011267 Py_ssize_t start;
11268 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011269 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011270
Jesus Ceaac451502011-04-20 17:09:23 +020011271 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
11272 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011273 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011274
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011275 if (PyUnicode_READY(self) == -1)
11276 return NULL;
11277 if (PyUnicode_READY(substring) == -1)
11278 return NULL;
11279
11280 result = any_find_slice(
11281 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11282 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011283 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011284
11285 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011286
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011287 if (result == -2)
11288 return NULL;
11289
Guido van Rossumd57fd912000-03-10 22:53:23 +000011290 if (result < 0) {
11291 PyErr_SetString(PyExc_ValueError, "substring not found");
11292 return NULL;
11293 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011294
Christian Heimes217cfd12007-12-02 14:31:20 +000011295 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011296}
11297
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011298PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011299 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011300\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011301Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011302done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011303
11304static PyObject *
11305unicode_rjust(PyUnicodeObject *self, PyObject *args)
11306{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011307 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011308 Py_UCS4 fillchar = ' ';
11309
Victor Stinnere9a29352011-10-01 02:14:59 +020011310 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011311 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011312
Victor Stinnere9a29352011-10-01 02:14:59 +020011313 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011314 return NULL;
11315
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011316 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011317 Py_INCREF(self);
11318 return (PyObject*) self;
11319 }
11320
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011321 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011322}
11323
Alexander Belopolsky40018472011-02-26 01:02:56 +000011324PyObject *
11325PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011326{
11327 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000011328
Guido van Rossumd57fd912000-03-10 22:53:23 +000011329 s = PyUnicode_FromObject(s);
11330 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011331 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011332 if (sep != NULL) {
11333 sep = PyUnicode_FromObject(sep);
11334 if (sep == NULL) {
11335 Py_DECREF(s);
11336 return NULL;
11337 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011338 }
11339
11340 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11341
11342 Py_DECREF(s);
11343 Py_XDECREF(sep);
11344 return result;
11345}
11346
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011347PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011348 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011349\n\
11350Return a list of the words in S, using sep as the\n\
11351delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000011352splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000011353whitespace string is a separator and empty strings are\n\
11354removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011355
11356static PyObject*
11357unicode_split(PyUnicodeObject *self, PyObject *args)
11358{
11359 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011360 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011361
Martin v. Löwis18e16552006-02-15 17:27:45 +000011362 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011363 return NULL;
11364
11365 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011366 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011367 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011368 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011369 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011370 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011371}
11372
Thomas Wouters477c8d52006-05-27 19:21:47 +000011373PyObject *
11374PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11375{
11376 PyObject* str_obj;
11377 PyObject* sep_obj;
11378 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011379 int kind1, kind2, kind;
11380 void *buf1 = NULL, *buf2 = NULL;
11381 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011382
11383 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020011384 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011385 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011386 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011387 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011388 Py_DECREF(str_obj);
11389 return NULL;
11390 }
11391
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011392 kind1 = PyUnicode_KIND(str_in);
11393 kind2 = PyUnicode_KIND(sep_obj);
11394 kind = kind1 > kind2 ? kind1 : kind2;
11395 buf1 = PyUnicode_DATA(str_in);
11396 if (kind1 != kind)
11397 buf1 = _PyUnicode_AsKind(str_in, kind);
11398 if (!buf1)
11399 goto onError;
11400 buf2 = PyUnicode_DATA(sep_obj);
11401 if (kind2 != kind)
11402 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11403 if (!buf2)
11404 goto onError;
11405 len1 = PyUnicode_GET_LENGTH(str_obj);
11406 len2 = PyUnicode_GET_LENGTH(sep_obj);
11407
11408 switch(PyUnicode_KIND(str_in)) {
11409 case PyUnicode_1BYTE_KIND:
11410 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11411 break;
11412 case PyUnicode_2BYTE_KIND:
11413 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11414 break;
11415 case PyUnicode_4BYTE_KIND:
11416 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11417 break;
11418 default:
11419 assert(0);
11420 out = 0;
11421 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011422
11423 Py_DECREF(sep_obj);
11424 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011425 if (kind1 != kind)
11426 PyMem_Free(buf1);
11427 if (kind2 != kind)
11428 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011429
11430 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011431 onError:
11432 Py_DECREF(sep_obj);
11433 Py_DECREF(str_obj);
11434 if (kind1 != kind && buf1)
11435 PyMem_Free(buf1);
11436 if (kind2 != kind && buf2)
11437 PyMem_Free(buf2);
11438 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011439}
11440
11441
11442PyObject *
11443PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11444{
11445 PyObject* str_obj;
11446 PyObject* sep_obj;
11447 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011448 int kind1, kind2, kind;
11449 void *buf1 = NULL, *buf2 = NULL;
11450 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011451
11452 str_obj = PyUnicode_FromObject(str_in);
11453 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000011454 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011455 sep_obj = PyUnicode_FromObject(sep_in);
11456 if (!sep_obj) {
11457 Py_DECREF(str_obj);
11458 return NULL;
11459 }
11460
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011461 kind1 = PyUnicode_KIND(str_in);
11462 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020011463 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011464 buf1 = PyUnicode_DATA(str_in);
11465 if (kind1 != kind)
11466 buf1 = _PyUnicode_AsKind(str_in, kind);
11467 if (!buf1)
11468 goto onError;
11469 buf2 = PyUnicode_DATA(sep_obj);
11470 if (kind2 != kind)
11471 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11472 if (!buf2)
11473 goto onError;
11474 len1 = PyUnicode_GET_LENGTH(str_obj);
11475 len2 = PyUnicode_GET_LENGTH(sep_obj);
11476
11477 switch(PyUnicode_KIND(str_in)) {
11478 case PyUnicode_1BYTE_KIND:
11479 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11480 break;
11481 case PyUnicode_2BYTE_KIND:
11482 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11483 break;
11484 case PyUnicode_4BYTE_KIND:
11485 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11486 break;
11487 default:
11488 assert(0);
11489 out = 0;
11490 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011491
11492 Py_DECREF(sep_obj);
11493 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011494 if (kind1 != kind)
11495 PyMem_Free(buf1);
11496 if (kind2 != kind)
11497 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011498
11499 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011500 onError:
11501 Py_DECREF(sep_obj);
11502 Py_DECREF(str_obj);
11503 if (kind1 != kind && buf1)
11504 PyMem_Free(buf1);
11505 if (kind2 != kind && buf2)
11506 PyMem_Free(buf2);
11507 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011508}
11509
11510PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011511 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011512\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011513Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011514the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011515found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011516
11517static PyObject*
11518unicode_partition(PyUnicodeObject *self, PyObject *separator)
11519{
11520 return PyUnicode_Partition((PyObject *)self, separator);
11521}
11522
11523PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000011524 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011525\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011526Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011527the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011528separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011529
11530static PyObject*
11531unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
11532{
11533 return PyUnicode_RPartition((PyObject *)self, separator);
11534}
11535
Alexander Belopolsky40018472011-02-26 01:02:56 +000011536PyObject *
11537PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011538{
11539 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011540
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011541 s = PyUnicode_FromObject(s);
11542 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011543 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011544 if (sep != NULL) {
11545 sep = PyUnicode_FromObject(sep);
11546 if (sep == NULL) {
11547 Py_DECREF(s);
11548 return NULL;
11549 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011550 }
11551
11552 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11553
11554 Py_DECREF(s);
11555 Py_XDECREF(sep);
11556 return result;
11557}
11558
11559PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011560 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011561\n\
11562Return a list of the words in S, using sep as the\n\
11563delimiter string, starting at the end of the string and\n\
11564working to the front. If maxsplit is given, at most maxsplit\n\
11565splits are done. If sep is not specified, any whitespace string\n\
11566is a separator.");
11567
11568static PyObject*
11569unicode_rsplit(PyUnicodeObject *self, PyObject *args)
11570{
11571 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011572 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011573
Martin v. Löwis18e16552006-02-15 17:27:45 +000011574 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011575 return NULL;
11576
11577 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011578 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011579 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011580 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011581 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011582 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011583}
11584
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011585PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011586 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011587\n\
11588Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000011589Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011590is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011591
11592static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011593unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011594{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011595 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000011596 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011597
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011598 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
11599 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011600 return NULL;
11601
Guido van Rossum86662912000-04-11 15:38:46 +000011602 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011603}
11604
11605static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000011606PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011607{
Walter Dörwald346737f2007-05-31 10:44:43 +000011608 if (PyUnicode_CheckExact(self)) {
11609 Py_INCREF(self);
11610 return self;
11611 } else
11612 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020011613 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011614}
11615
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011616PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011617 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011618\n\
11619Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011620and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011621
11622static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011623unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011624{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011625 return fixup(self, fixswapcase);
11626}
11627
Georg Brandlceee0772007-11-27 23:48:05 +000011628PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011629 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011630\n\
11631Return a translation table usable for str.translate().\n\
11632If there is only one argument, it must be a dictionary mapping Unicode\n\
11633ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011634Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011635If there are two arguments, they must be strings of equal length, and\n\
11636in the resulting dictionary, each character in x will be mapped to the\n\
11637character at the same position in y. If there is a third argument, it\n\
11638must be a string, whose characters will be mapped to None in the result.");
11639
11640static PyObject*
11641unicode_maketrans(PyUnicodeObject *null, PyObject *args)
11642{
11643 PyObject *x, *y = NULL, *z = NULL;
11644 PyObject *new = NULL, *key, *value;
11645 Py_ssize_t i = 0;
11646 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011647
Georg Brandlceee0772007-11-27 23:48:05 +000011648 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
11649 return NULL;
11650 new = PyDict_New();
11651 if (!new)
11652 return NULL;
11653 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011654 int x_kind, y_kind, z_kind;
11655 void *x_data, *y_data, *z_data;
11656
Georg Brandlceee0772007-11-27 23:48:05 +000011657 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000011658 if (!PyUnicode_Check(x)) {
11659 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
11660 "be a string if there is a second argument");
11661 goto err;
11662 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011663 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011664 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
11665 "arguments must have equal length");
11666 goto err;
11667 }
11668 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011669 x_kind = PyUnicode_KIND(x);
11670 y_kind = PyUnicode_KIND(y);
11671 x_data = PyUnicode_DATA(x);
11672 y_data = PyUnicode_DATA(y);
11673 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
11674 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
11675 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011676 if (!key || !value)
11677 goto err;
11678 res = PyDict_SetItem(new, key, value);
11679 Py_DECREF(key);
11680 Py_DECREF(value);
11681 if (res < 0)
11682 goto err;
11683 }
11684 /* create entries for deleting chars in z */
11685 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011686 z_kind = PyUnicode_KIND(z);
11687 z_data = PyUnicode_DATA(z);
Georg Brandlceee0772007-11-27 23:48:05 +000011688 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011689 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011690 if (!key)
11691 goto err;
11692 res = PyDict_SetItem(new, key, Py_None);
11693 Py_DECREF(key);
11694 if (res < 0)
11695 goto err;
11696 }
11697 }
11698 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011699 int kind;
11700 void *data;
11701
Georg Brandlceee0772007-11-27 23:48:05 +000011702 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000011703 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011704 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
11705 "to maketrans it must be a dict");
11706 goto err;
11707 }
11708 /* copy entries into the new dict, converting string keys to int keys */
11709 while (PyDict_Next(x, &i, &key, &value)) {
11710 if (PyUnicode_Check(key)) {
11711 /* convert string keys to integer keys */
11712 PyObject *newkey;
11713 if (PyUnicode_GET_SIZE(key) != 1) {
11714 PyErr_SetString(PyExc_ValueError, "string keys in translate "
11715 "table must be of length 1");
11716 goto err;
11717 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011718 kind = PyUnicode_KIND(key);
11719 data = PyUnicode_DATA(key);
11720 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000011721 if (!newkey)
11722 goto err;
11723 res = PyDict_SetItem(new, newkey, value);
11724 Py_DECREF(newkey);
11725 if (res < 0)
11726 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000011727 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011728 /* just keep integer keys */
11729 if (PyDict_SetItem(new, key, value) < 0)
11730 goto err;
11731 } else {
11732 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
11733 "be strings or integers");
11734 goto err;
11735 }
11736 }
11737 }
11738 return new;
11739 err:
11740 Py_DECREF(new);
11741 return NULL;
11742}
11743
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011744PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011745 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011746\n\
11747Return a copy of the string S, where all characters have been mapped\n\
11748through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011749Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000011750Unmapped characters are left untouched. Characters mapped to None\n\
11751are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011752
11753static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011754unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011755{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011756 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011757}
11758
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011759PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011760 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011761\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011762Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011763
11764static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011765unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011766{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011767 return fixup(self, fixupper);
11768}
11769
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011770PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011771 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011772\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000011773Pad a numeric string S with zeros on the left, to fill a field\n\
11774of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011775
11776static PyObject *
11777unicode_zfill(PyUnicodeObject *self, PyObject *args)
11778{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011779 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011780 PyUnicodeObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011781 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011782 int kind;
11783 void *data;
11784 Py_UCS4 chr;
11785
11786 if (PyUnicode_READY(self) == -1)
11787 return NULL;
11788
Martin v. Löwis18e16552006-02-15 17:27:45 +000011789 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011790 return NULL;
11791
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011792 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000011793 if (PyUnicode_CheckExact(self)) {
11794 Py_INCREF(self);
11795 return (PyObject*) self;
11796 }
11797 else
Victor Stinner2219e0a2011-10-01 01:16:59 +020011798 return PyUnicode_Copy((PyObject*)self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011799 }
11800
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011801 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011802
11803 u = pad(self, fill, 0, '0');
11804
Walter Dörwald068325e2002-04-15 13:36:47 +000011805 if (u == NULL)
11806 return NULL;
11807
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011808 kind = PyUnicode_KIND(u);
11809 data = PyUnicode_DATA(u);
11810 chr = PyUnicode_READ(kind, data, fill);
11811
11812 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011813 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011814 PyUnicode_WRITE(kind, data, 0, chr);
11815 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000011816 }
11817
11818 return (PyObject*) u;
11819}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011820
#if 0
/* Debugging helper (currently compiled out): forwards to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *ascii = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return ascii;
}
#endif
11828
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011829PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011830 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011831\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011832Return True if S starts with the specified prefix, False otherwise.\n\
11833With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011834With optional end, stop comparing S at that position.\n\
11835prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011836
11837static PyObject *
11838unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011839 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011840{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011841 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011842 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011843 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011844 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011845 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011846
Jesus Ceaac451502011-04-20 17:09:23 +020011847 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011848 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011849 if (PyTuple_Check(subobj)) {
11850 Py_ssize_t i;
11851 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11852 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011853 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011854 if (substring == NULL)
11855 return NULL;
11856 result = tailmatch(self, substring, start, end, -1);
11857 Py_DECREF(substring);
11858 if (result) {
11859 Py_RETURN_TRUE;
11860 }
11861 }
11862 /* nothing matched */
11863 Py_RETURN_FALSE;
11864 }
11865 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011866 if (substring == NULL) {
11867 if (PyErr_ExceptionMatches(PyExc_TypeError))
11868 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
11869 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011870 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011871 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011872 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011873 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011874 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011875}
11876
11877
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011878PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011879 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011880\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011881Return True if S ends with the specified suffix, False otherwise.\n\
11882With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011883With optional end, stop comparing S at that position.\n\
11884suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011885
11886static PyObject *
11887unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011888 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011889{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011890 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011891 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011892 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011893 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011894 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011895
Jesus Ceaac451502011-04-20 17:09:23 +020011896 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011897 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011898 if (PyTuple_Check(subobj)) {
11899 Py_ssize_t i;
11900 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11901 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011902 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011903 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011904 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011905 result = tailmatch(self, substring, start, end, +1);
11906 Py_DECREF(substring);
11907 if (result) {
11908 Py_RETURN_TRUE;
11909 }
11910 }
11911 Py_RETURN_FALSE;
11912 }
11913 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011914 if (substring == NULL) {
11915 if (PyErr_ExceptionMatches(PyExc_TypeError))
11916 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
11917 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011918 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011919 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011920 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011921 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011922 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011923}
11924
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011925#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000011926
11927PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011928 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000011929\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011930Return a formatted version of S, using substitutions from args and kwargs.\n\
11931The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000011932
Eric Smith27bbca62010-11-04 17:06:58 +000011933PyDoc_STRVAR(format_map__doc__,
11934 "S.format_map(mapping) -> str\n\
11935\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011936Return a formatted version of S, using substitutions from mapping.\n\
11937The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000011938
Eric Smith4a7d76d2008-05-30 18:10:19 +000011939static PyObject *
11940unicode__format__(PyObject* self, PyObject* args)
11941{
11942 PyObject *format_spec;
11943
11944 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
11945 return NULL;
11946
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011947 return _PyUnicode_FormatAdvanced(self, format_spec, 0,
11948 PyUnicode_GET_LENGTH(format_spec));
Eric Smith4a7d76d2008-05-30 18:10:19 +000011949}
11950
Eric Smith8c663262007-08-25 02:26:07 +000011951PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011952 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000011953\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011954Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000011955
11956static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011957unicode__sizeof__(PyUnicodeObject *v)
11958{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011959 Py_ssize_t size;
11960
11961 /* If it's a compact object, account for base structure +
11962 character data. */
11963 if (PyUnicode_IS_COMPACT_ASCII(v))
11964 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
11965 else if (PyUnicode_IS_COMPACT(v))
11966 size = sizeof(PyCompactUnicodeObject) +
11967 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v);
11968 else {
11969 /* If it is a two-block object, account for base object, and
11970 for character block if present. */
11971 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020011972 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011973 size += (PyUnicode_GET_LENGTH(v) + 1) *
11974 PyUnicode_CHARACTER_SIZE(v);
11975 }
11976 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020011977 with the data pointer. Check if the data is not shared. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011978 if (_PyUnicode_WSTR(v) &&
Victor Stinnera3be6132011-10-03 02:16:37 +020011979 (PyUnicode_DATA(v) != _PyUnicode_WSTR(v)))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011980 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020011981 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020011982 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011983
11984 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011985}
11986
11987PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011988 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011989
11990static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020011991unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000011992{
Victor Stinner034f6cf2011-09-30 02:26:44 +020011993 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011994 if (!copy)
11995 return NULL;
11996 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000011997}
11998
Guido van Rossumd57fd912000-03-10 22:53:23 +000011999static PyMethodDef unicode_methods[] = {
12000
12001 /* Order is according to common usage: often used methods should
12002 appear first, since lookup is done sequentially. */
12003
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012004 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012005 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12006 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012007 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012008 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12009 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12010 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12011 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12012 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12013 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12014 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012015 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012016 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12017 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12018 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012019 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012020 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12021 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12022 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012023 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012024 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012025 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012026 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012027 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12028 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12029 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12030 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12031 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12032 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12033 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12034 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12035 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12036 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12037 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12038 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12039 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12040 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012041 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012042 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012043 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012044 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012045 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012046 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012047 {"maketrans", (PyCFunction) unicode_maketrans,
12048 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012049 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012050#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012051 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012052#endif
12053
12054#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012055 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012056 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012057#endif
12058
Benjamin Peterson14339b62009-01-31 16:36:08 +000012059 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012060 {NULL, NULL}
12061};
12062
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012063static PyObject *
12064unicode_mod(PyObject *v, PyObject *w)
12065{
Brian Curtindfc80e32011-08-10 20:28:54 -050012066 if (!PyUnicode_Check(v))
12067 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012068 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012069}
12070
12071static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012072 0, /*nb_add*/
12073 0, /*nb_subtract*/
12074 0, /*nb_multiply*/
12075 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012076};
12077
Guido van Rossumd57fd912000-03-10 22:53:23 +000012078static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012079 (lenfunc) unicode_length, /* sq_length */
12080 PyUnicode_Concat, /* sq_concat */
12081 (ssizeargfunc) unicode_repeat, /* sq_repeat */
12082 (ssizeargfunc) unicode_getitem, /* sq_item */
12083 0, /* sq_slice */
12084 0, /* sq_ass_item */
12085 0, /* sq_ass_slice */
12086 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012087};
12088
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012089static PyObject*
12090unicode_subscript(PyUnicodeObject* self, PyObject* item)
12091{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012092 if (PyUnicode_READY(self) == -1)
12093 return NULL;
12094
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000012095 if (PyIndex_Check(item)) {
12096 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012097 if (i == -1 && PyErr_Occurred())
12098 return NULL;
12099 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012100 i += PyUnicode_GET_LENGTH(self);
Victor Stinner2fe5ced2011-10-02 00:25:40 +020012101 return unicode_getitem((PyObject*)self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012102 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000012103 Py_ssize_t start, stop, step, slicelength, cur, i;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012104 const Py_UNICODE* source_buf;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012105 Py_UNICODE* result_buf;
12106 PyObject* result;
12107
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012108 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000012109 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012110 return NULL;
12111 }
12112
12113 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012114 return PyUnicode_New(0, 0);
12115 } else if (start == 0 && step == 1 &&
12116 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000012117 PyUnicode_CheckExact(self)) {
12118 Py_INCREF(self);
12119 return (PyObject *)self;
12120 } else if (step == 1) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012121 return PyUnicode_Substring((PyObject*)self,
12122 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012123 } else {
12124 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +000012125 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
12126 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012127
Benjamin Peterson29060642009-01-31 22:14:21 +000012128 if (result_buf == NULL)
12129 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012130
12131 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
12132 result_buf[i] = source_buf[cur];
12133 }
Tim Petersced69f82003-09-16 20:30:58 +000012134
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012135 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +000012136 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012137 return result;
12138 }
12139 } else {
12140 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
12141 return NULL;
12142 }
12143}
12144
12145static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012146 (lenfunc)unicode_length, /* mp_length */
12147 (binaryfunc)unicode_subscript, /* mp_subscript */
12148 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012149};
12150
Guido van Rossumd57fd912000-03-10 22:53:23 +000012151
Guido van Rossumd57fd912000-03-10 22:53:23 +000012152/* Helpers for PyUnicode_Format() */
12153
12154static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000012155getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012156{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012157 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012158 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012159 (*p_argidx)++;
12160 if (arglen < 0)
12161 return args;
12162 else
12163 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012164 }
12165 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012166 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012167 return NULL;
12168}
12169
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012170/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012171
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012172static PyObject *
12173formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012174{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012175 char *p;
12176 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012177 double x;
Tim Petersced69f82003-09-16 20:30:58 +000012178
Guido van Rossumd57fd912000-03-10 22:53:23 +000012179 x = PyFloat_AsDouble(v);
12180 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012181 return NULL;
12182
Guido van Rossumd57fd912000-03-10 22:53:23 +000012183 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012184 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000012185
Eric Smith0923d1d2009-04-16 20:16:10 +000012186 p = PyOS_double_to_string(x, type, prec,
12187 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012188 if (p == NULL)
12189 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012190 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000012191 PyMem_Free(p);
12192 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012193}
12194
Tim Peters38fd5b62000-09-21 05:43:11 +000012195static PyObject*
12196formatlong(PyObject *val, int flags, int prec, int type)
12197{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012198 char *buf;
12199 int len;
12200 PyObject *str; /* temporary string object. */
12201 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012202
Benjamin Peterson14339b62009-01-31 16:36:08 +000012203 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
12204 if (!str)
12205 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012206 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012207 Py_DECREF(str);
12208 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012209}
12210
Guido van Rossumd57fd912000-03-10 22:53:23 +000012211static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012212formatchar(Py_UCS4 *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012213 size_t buflen,
12214 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012215{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012216 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012217 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012218 if (PyUnicode_GET_LENGTH(v) == 1) {
12219 buf[0] = PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000012220 buf[1] = '\0';
12221 return 1;
12222 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012223 goto onError;
12224 }
12225 else {
12226 /* Integer input truncated to a character */
12227 long x;
12228 x = PyLong_AsLong(v);
12229 if (x == -1 && PyErr_Occurred())
12230 goto onError;
12231
12232 if (x < 0 || x > 0x10ffff) {
12233 PyErr_SetString(PyExc_OverflowError,
12234 "%c arg not in range(0x110000)");
12235 return -1;
12236 }
12237
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012238 buf[0] = (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012239 buf[1] = '\0';
12240 return 1;
12241 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012242
Benjamin Peterson29060642009-01-31 22:14:21 +000012243 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012244 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012245 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012246 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012247}
12248
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012249/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012250 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012251*/
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012252#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012253
Alexander Belopolsky40018472011-02-26 01:02:56 +000012254PyObject *
12255PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012256{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012257 void *fmt;
12258 int fmtkind;
12259 PyObject *result;
12260 Py_UCS4 *res, *res0;
12261 Py_UCS4 max;
12262 int kind;
12263 Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012264 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012265 PyObject *dict = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012266 PyUnicodeObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +000012267
Guido van Rossumd57fd912000-03-10 22:53:23 +000012268 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012269 PyErr_BadInternalCall();
12270 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012271 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012272 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
12273 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012274 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012275 fmt = PyUnicode_DATA(uformat);
12276 fmtkind = PyUnicode_KIND(uformat);
12277 fmtcnt = PyUnicode_GET_LENGTH(uformat);
12278 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012279
12280 reslen = rescnt = fmtcnt + 100;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012281 res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4));
12282 if (res0 == NULL) {
12283 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012284 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012285 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012286
12287 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012288 arglen = PyTuple_Size(args);
12289 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012290 }
12291 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012292 arglen = -1;
12293 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012294 }
Christian Heimes90aa7642007-12-19 02:45:37 +000012295 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000012296 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000012297 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012298
12299 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012300 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012301 if (--rescnt < 0) {
12302 rescnt = fmtcnt + 100;
12303 reslen += rescnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012304 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12305 if (res0 == NULL){
12306 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012307 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012308 }
12309 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012310 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012311 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012312 *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012313 }
12314 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012315 /* Got a format specifier */
12316 int flags = 0;
12317 Py_ssize_t width = -1;
12318 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012319 Py_UCS4 c = '\0';
12320 Py_UCS4 fill;
Benjamin Peterson29060642009-01-31 22:14:21 +000012321 int isnumok;
12322 PyObject *v = NULL;
12323 PyObject *temp = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012324 void *pbuf;
12325 Py_ssize_t pindex;
Benjamin Peterson29060642009-01-31 22:14:21 +000012326 Py_UNICODE sign;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012327 Py_ssize_t len, len1;
12328 Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012329
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012330 fmtpos++;
12331 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
12332 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000012333 Py_ssize_t keylen;
12334 PyObject *key;
12335 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000012336
Benjamin Peterson29060642009-01-31 22:14:21 +000012337 if (dict == NULL) {
12338 PyErr_SetString(PyExc_TypeError,
12339 "format requires a mapping");
12340 goto onError;
12341 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012342 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012343 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012344 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012345 /* Skip over balanced parentheses */
12346 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012347 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000012348 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012349 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000012350 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012351 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000012352 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012353 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012354 if (fmtcnt < 0 || pcount > 0) {
12355 PyErr_SetString(PyExc_ValueError,
12356 "incomplete format key");
12357 goto onError;
12358 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020012359 key = PyUnicode_Substring((PyObject*)uformat,
12360 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000012361 if (key == NULL)
12362 goto onError;
12363 if (args_owned) {
12364 Py_DECREF(args);
12365 args_owned = 0;
12366 }
12367 args = PyObject_GetItem(dict, key);
12368 Py_DECREF(key);
12369 if (args == NULL) {
12370 goto onError;
12371 }
12372 args_owned = 1;
12373 arglen = -1;
12374 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012375 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012376 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012377 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012378 case '-': flags |= F_LJUST; continue;
12379 case '+': flags |= F_SIGN; continue;
12380 case ' ': flags |= F_BLANK; continue;
12381 case '#': flags |= F_ALT; continue;
12382 case '0': flags |= F_ZERO; continue;
12383 }
12384 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012385 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012386 if (c == '*') {
12387 v = getnextarg(args, arglen, &argidx);
12388 if (v == NULL)
12389 goto onError;
12390 if (!PyLong_Check(v)) {
12391 PyErr_SetString(PyExc_TypeError,
12392 "* wants int");
12393 goto onError;
12394 }
12395 width = PyLong_AsLong(v);
12396 if (width == -1 && PyErr_Occurred())
12397 goto onError;
12398 if (width < 0) {
12399 flags |= F_LJUST;
12400 width = -width;
12401 }
12402 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012403 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012404 }
12405 else if (c >= '0' && c <= '9') {
12406 width = c - '0';
12407 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012408 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012409 if (c < '0' || c > '9')
12410 break;
12411 if ((width*10) / 10 != width) {
12412 PyErr_SetString(PyExc_ValueError,
12413 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000012414 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000012415 }
12416 width = width*10 + (c - '0');
12417 }
12418 }
12419 if (c == '.') {
12420 prec = 0;
12421 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012422 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012423 if (c == '*') {
12424 v = getnextarg(args, arglen, &argidx);
12425 if (v == NULL)
12426 goto onError;
12427 if (!PyLong_Check(v)) {
12428 PyErr_SetString(PyExc_TypeError,
12429 "* wants int");
12430 goto onError;
12431 }
12432 prec = PyLong_AsLong(v);
12433 if (prec == -1 && PyErr_Occurred())
12434 goto onError;
12435 if (prec < 0)
12436 prec = 0;
12437 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012438 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012439 }
12440 else if (c >= '0' && c <= '9') {
12441 prec = c - '0';
12442 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012443 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012444 if (c < '0' || c > '9')
12445 break;
12446 if ((prec*10) / 10 != prec) {
12447 PyErr_SetString(PyExc_ValueError,
12448 "prec too big");
12449 goto onError;
12450 }
12451 prec = prec*10 + (c - '0');
12452 }
12453 }
12454 } /* prec */
12455 if (fmtcnt >= 0) {
12456 if (c == 'h' || c == 'l' || c == 'L') {
12457 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012458 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012459 }
12460 }
12461 if (fmtcnt < 0) {
12462 PyErr_SetString(PyExc_ValueError,
12463 "incomplete format");
12464 goto onError;
12465 }
12466 if (c != '%') {
12467 v = getnextarg(args, arglen, &argidx);
12468 if (v == NULL)
12469 goto onError;
12470 }
12471 sign = 0;
12472 fill = ' ';
12473 switch (c) {
12474
12475 case '%':
12476 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012477 kind = PyUnicode_4BYTE_KIND;
Benjamin Peterson29060642009-01-31 22:14:21 +000012478 /* presume that buffer length is at least 1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012479 PyUnicode_WRITE(kind, pbuf, 0, '%');
Benjamin Peterson29060642009-01-31 22:14:21 +000012480 len = 1;
12481 break;
12482
12483 case 's':
12484 case 'r':
12485 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000012486 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012487 temp = v;
12488 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012489 }
12490 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012491 if (c == 's')
12492 temp = PyObject_Str(v);
12493 else if (c == 'r')
12494 temp = PyObject_Repr(v);
12495 else
12496 temp = PyObject_ASCII(v);
12497 if (temp == NULL)
12498 goto onError;
12499 if (PyUnicode_Check(temp))
12500 /* nothing to do */;
12501 else {
12502 Py_DECREF(temp);
12503 PyErr_SetString(PyExc_TypeError,
12504 "%s argument has non-string str()");
12505 goto onError;
12506 }
12507 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012508 if (PyUnicode_READY(temp) == -1) {
12509 Py_CLEAR(temp);
12510 goto onError;
12511 }
12512 pbuf = PyUnicode_DATA(temp);
12513 kind = PyUnicode_KIND(temp);
12514 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012515 if (prec >= 0 && len > prec)
12516 len = prec;
12517 break;
12518
12519 case 'i':
12520 case 'd':
12521 case 'u':
12522 case 'o':
12523 case 'x':
12524 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000012525 isnumok = 0;
12526 if (PyNumber_Check(v)) {
12527 PyObject *iobj=NULL;
12528
12529 if (PyLong_Check(v)) {
12530 iobj = v;
12531 Py_INCREF(iobj);
12532 }
12533 else {
12534 iobj = PyNumber_Long(v);
12535 }
12536 if (iobj!=NULL) {
12537 if (PyLong_Check(iobj)) {
12538 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070012539 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000012540 Py_DECREF(iobj);
12541 if (!temp)
12542 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012543 if (PyUnicode_READY(temp) == -1) {
12544 Py_CLEAR(temp);
12545 goto onError;
12546 }
12547 pbuf = PyUnicode_DATA(temp);
12548 kind = PyUnicode_KIND(temp);
12549 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012550 sign = 1;
12551 }
12552 else {
12553 Py_DECREF(iobj);
12554 }
12555 }
12556 }
12557 if (!isnumok) {
12558 PyErr_Format(PyExc_TypeError,
12559 "%%%c format: a number is required, "
12560 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
12561 goto onError;
12562 }
12563 if (flags & F_ZERO)
12564 fill = '0';
12565 break;
12566
12567 case 'e':
12568 case 'E':
12569 case 'f':
12570 case 'F':
12571 case 'g':
12572 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012573 temp = formatfloat(v, flags, prec, c);
12574 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000012575 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012576 if (PyUnicode_READY(temp) == -1) {
12577 Py_CLEAR(temp);
12578 goto onError;
12579 }
12580 pbuf = PyUnicode_DATA(temp);
12581 kind = PyUnicode_KIND(temp);
12582 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012583 sign = 1;
12584 if (flags & F_ZERO)
12585 fill = '0';
12586 break;
12587
12588 case 'c':
12589 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012590 kind = PyUnicode_4BYTE_KIND;
Victor Stinnerb9dcffb2011-09-29 00:39:24 +020012591 len = formatchar(pbuf, Py_ARRAY_LENGTH(formatbuf), v);
Benjamin Peterson29060642009-01-31 22:14:21 +000012592 if (len < 0)
12593 goto onError;
12594 break;
12595
12596 default:
12597 PyErr_Format(PyExc_ValueError,
12598 "unsupported format character '%c' (0x%x) "
12599 "at index %zd",
12600 (31<=c && c<=126) ? (char)c : '?',
12601 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012602 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000012603 goto onError;
12604 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012605 /* pbuf is initialized here. */
12606 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000012607 if (sign) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012608 if (PyUnicode_READ(kind, pbuf, pindex) == '-' ||
12609 PyUnicode_READ(kind, pbuf, pindex) == '+') {
12610 sign = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012611 len--;
12612 }
12613 else if (flags & F_SIGN)
12614 sign = '+';
12615 else if (flags & F_BLANK)
12616 sign = ' ';
12617 else
12618 sign = 0;
12619 }
12620 if (width < len)
12621 width = len;
12622 if (rescnt - (sign != 0) < width) {
12623 reslen -= rescnt;
12624 rescnt = width + fmtcnt + 100;
12625 reslen += rescnt;
12626 if (reslen < 0) {
12627 Py_XDECREF(temp);
12628 PyErr_NoMemory();
12629 goto onError;
12630 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012631 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12632 if (res0 == 0) {
12633 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012634 Py_XDECREF(temp);
12635 goto onError;
12636 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012637 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012638 }
12639 if (sign) {
12640 if (fill != ' ')
12641 *res++ = sign;
12642 rescnt--;
12643 if (width > len)
12644 width--;
12645 }
12646 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012647 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12648 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000012649 if (fill != ' ') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012650 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12651 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012652 }
12653 rescnt -= 2;
12654 width -= 2;
12655 if (width < 0)
12656 width = 0;
12657 len -= 2;
12658 }
12659 if (width > len && !(flags & F_LJUST)) {
12660 do {
12661 --rescnt;
12662 *res++ = fill;
12663 } while (--width > len);
12664 }
12665 if (fill == ' ') {
12666 if (sign)
12667 *res++ = sign;
12668 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012669 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12670 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
12671 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12672 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012673 }
12674 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012675 /* Copy all characters, preserving len */
12676 len1 = len;
12677 while (len1--) {
12678 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12679 rescnt--;
12680 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012681 while (--width >= len) {
12682 --rescnt;
12683 *res++ = ' ';
12684 }
12685 if (dict && (argidx < arglen) && c != '%') {
12686 PyErr_SetString(PyExc_TypeError,
12687 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +000012688 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012689 goto onError;
12690 }
12691 Py_XDECREF(temp);
12692 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012693 } /* until end */
12694 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012695 PyErr_SetString(PyExc_TypeError,
12696 "not all arguments converted during string formatting");
12697 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012698 }
12699
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012700
12701 for (max=0, res = res0; res < res0+reslen-rescnt; res++)
12702 if (*res > max)
12703 max = *res;
12704 result = PyUnicode_New(reslen - rescnt, max);
12705 if (!result)
Benjamin Peterson29060642009-01-31 22:14:21 +000012706 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012707 kind = PyUnicode_KIND(result);
12708 for (res = res0; res < res0+reslen-rescnt; res++)
12709 PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res);
12710 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012711 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012712 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012713 }
12714 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012715 return (PyObject *)result;
12716
Benjamin Peterson29060642009-01-31 22:14:21 +000012717 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012718 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012719 Py_DECREF(uformat);
12720 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012721 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012722 }
12723 return NULL;
12724}
12725
Jeremy Hylton938ace62002-07-17 16:30:39 +000012726static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000012727unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
12728
Tim Peters6d6c1a32001-08-02 04:15:00 +000012729static PyObject *
12730unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12731{
Benjamin Peterson29060642009-01-31 22:14:21 +000012732 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012733 static char *kwlist[] = {"object", "encoding", "errors", 0};
12734 char *encoding = NULL;
12735 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000012736
Benjamin Peterson14339b62009-01-31 16:36:08 +000012737 if (type != &PyUnicode_Type)
12738 return unicode_subtype_new(type, args, kwds);
12739 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000012740 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012741 return NULL;
12742 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012743 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012744 if (encoding == NULL && errors == NULL)
12745 return PyObject_Str(x);
12746 else
Benjamin Peterson29060642009-01-31 22:14:21 +000012747 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000012748}
12749
Guido van Rossume023fe02001-08-30 03:12:59 +000012750static PyObject *
12751unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12752{
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012753 PyUnicodeObject *unicode, *self;
12754 Py_ssize_t length, char_size;
12755 int share_wstr, share_utf8;
12756 unsigned int kind;
12757 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000012758
Benjamin Peterson14339b62009-01-31 16:36:08 +000012759 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012760
12761 unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
12762 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012763 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020012764 assert(_PyUnicode_CHECK(unicode));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012765 if (PyUnicode_READY(unicode))
12766 return NULL;
12767
12768 self = (PyUnicodeObject *) type->tp_alloc(type, 0);
12769 if (self == NULL) {
12770 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012771 return NULL;
12772 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012773 kind = PyUnicode_KIND(unicode);
12774 length = PyUnicode_GET_LENGTH(unicode);
12775
12776 _PyUnicode_LENGTH(self) = length;
12777 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
12778 _PyUnicode_STATE(self).interned = 0;
12779 _PyUnicode_STATE(self).kind = kind;
12780 _PyUnicode_STATE(self).compact = 0;
12781 _PyUnicode_STATE(self).ascii = 0;
12782 _PyUnicode_STATE(self).ready = 1;
12783 _PyUnicode_WSTR(self) = NULL;
12784 _PyUnicode_UTF8_LENGTH(self) = 0;
12785 _PyUnicode_UTF8(self) = NULL;
12786 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020012787 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012788
12789 share_utf8 = 0;
12790 share_wstr = 0;
12791 if (kind == PyUnicode_1BYTE_KIND) {
12792 char_size = 1;
12793 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
12794 share_utf8 = 1;
12795 }
12796 else if (kind == PyUnicode_2BYTE_KIND) {
12797 char_size = 2;
12798 if (sizeof(wchar_t) == 2)
12799 share_wstr = 1;
12800 }
12801 else {
12802 assert(kind == PyUnicode_4BYTE_KIND);
12803 char_size = 4;
12804 if (sizeof(wchar_t) == 4)
12805 share_wstr = 1;
12806 }
12807
12808 /* Ensure we won't overflow the length. */
12809 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
12810 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012811 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012812 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012813 data = PyObject_MALLOC((length + 1) * char_size);
12814 if (data == NULL) {
12815 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012816 goto onError;
12817 }
12818
Victor Stinnerc3c74152011-10-02 20:39:55 +020012819 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012820 if (share_utf8) {
12821 _PyUnicode_UTF8_LENGTH(self) = length;
12822 _PyUnicode_UTF8(self) = data;
12823 }
12824 if (share_wstr) {
12825 _PyUnicode_WSTR_LENGTH(self) = length;
12826 _PyUnicode_WSTR(self) = (wchar_t *)data;
12827 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012828
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012829 Py_MEMCPY(data, PyUnicode_DATA(unicode),
12830 PyUnicode_KIND_SIZE(kind, length + 1));
12831 Py_DECREF(unicode);
12832 return (PyObject *)self;
12833
12834onError:
12835 Py_DECREF(unicode);
12836 Py_DECREF(self);
12837 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000012838}
12839
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012840PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000012841 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000012842\n\
Collin Winterd474ce82007-08-07 19:42:11 +000012843Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000012844encoding defaults to the current default string encoding.\n\
12845errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000012846
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012847static PyObject *unicode_iter(PyObject *seq);
12848
Guido van Rossumd57fd912000-03-10 22:53:23 +000012849PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000012850 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012851 "str", /* tp_name */
12852 sizeof(PyUnicodeObject), /* tp_size */
12853 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012854 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012855 (destructor)unicode_dealloc, /* tp_dealloc */
12856 0, /* tp_print */
12857 0, /* tp_getattr */
12858 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000012859 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012860 unicode_repr, /* tp_repr */
12861 &unicode_as_number, /* tp_as_number */
12862 &unicode_as_sequence, /* tp_as_sequence */
12863 &unicode_as_mapping, /* tp_as_mapping */
12864 (hashfunc) unicode_hash, /* tp_hash*/
12865 0, /* tp_call*/
12866 (reprfunc) unicode_str, /* tp_str */
12867 PyObject_GenericGetAttr, /* tp_getattro */
12868 0, /* tp_setattro */
12869 0, /* tp_as_buffer */
12870 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000012871 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012872 unicode_doc, /* tp_doc */
12873 0, /* tp_traverse */
12874 0, /* tp_clear */
12875 PyUnicode_RichCompare, /* tp_richcompare */
12876 0, /* tp_weaklistoffset */
12877 unicode_iter, /* tp_iter */
12878 0, /* tp_iternext */
12879 unicode_methods, /* tp_methods */
12880 0, /* tp_members */
12881 0, /* tp_getset */
12882 &PyBaseObject_Type, /* tp_base */
12883 0, /* tp_dict */
12884 0, /* tp_descr_get */
12885 0, /* tp_descr_set */
12886 0, /* tp_dictoffset */
12887 0, /* tp_init */
12888 0, /* tp_alloc */
12889 unicode_new, /* tp_new */
12890 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012891};
12892
12893/* Initialize the Unicode implementation */
12894
Thomas Wouters78890102000-07-22 19:25:51 +000012895void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012896{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012897 int i;
12898
Thomas Wouters477c8d52006-05-27 19:21:47 +000012899 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012900 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012901 0x000A, /* LINE FEED */
12902 0x000D, /* CARRIAGE RETURN */
12903 0x001C, /* FILE SEPARATOR */
12904 0x001D, /* GROUP SEPARATOR */
12905 0x001E, /* RECORD SEPARATOR */
12906 0x0085, /* NEXT LINE */
12907 0x2028, /* LINE SEPARATOR */
12908 0x2029, /* PARAGRAPH SEPARATOR */
12909 };
12910
Fred Drakee4315f52000-05-09 19:53:39 +000012911 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020012912 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012913 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012914 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012915
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012916 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000012917 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000012918 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012919 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012920
12921 /* initialize the linebreak bloom filter */
12922 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012923 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020012924 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012925
12926 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012927}
12928
12929/* Finalize the Unicode implementation */
12930
/* This implementation does no free-list work: the function always
   returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
12936
Guido van Rossumd57fd912000-03-10 22:53:23 +000012937void
Thomas Wouters78890102000-07-22 19:25:51 +000012938_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012939{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012940 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012941
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000012942 Py_XDECREF(unicode_empty);
12943 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000012944
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012945 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012946 if (unicode_latin1[i]) {
12947 Py_DECREF(unicode_latin1[i]);
12948 unicode_latin1[i] = NULL;
12949 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012950 }
Christian Heimesa156e092008-02-16 07:38:31 +000012951 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012952}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000012953
Walter Dörwald16807132007-05-25 13:52:07 +000012954void
12955PyUnicode_InternInPlace(PyObject **p)
12956{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012957 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
12958 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020012959#ifdef Py_DEBUG
12960 assert(s != NULL);
12961 assert(_PyUnicode_CHECK(s));
12962#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000012963 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020012964 return;
12965#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000012966 /* If it's a subclass, we don't really know what putting
12967 it in the interned dict might do. */
12968 if (!PyUnicode_CheckExact(s))
12969 return;
12970 if (PyUnicode_CHECK_INTERNED(s))
12971 return;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012972 if (PyUnicode_READY(s) == -1) {
Victor Stinner4fae54c2011-10-03 02:01:52 +020012973 assert(0 && "PyUnicode_READY fail in PyUnicode_InternInPlace");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012974 return;
12975 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012976 if (interned == NULL) {
12977 interned = PyDict_New();
12978 if (interned == NULL) {
12979 PyErr_Clear(); /* Don't leave an exception */
12980 return;
12981 }
12982 }
12983 /* It might be that the GetItem call fails even
12984 though the key is present in the dictionary,
12985 namely when this happens during a stack overflow. */
12986 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000012987 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012988 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000012989
Benjamin Peterson29060642009-01-31 22:14:21 +000012990 if (t) {
12991 Py_INCREF(t);
12992 Py_DECREF(*p);
12993 *p = t;
12994 return;
12995 }
Walter Dörwald16807132007-05-25 13:52:07 +000012996
Benjamin Peterson14339b62009-01-31 16:36:08 +000012997 PyThreadState_GET()->recursion_critical = 1;
12998 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
12999 PyErr_Clear();
13000 PyThreadState_GET()->recursion_critical = 0;
13001 return;
13002 }
13003 PyThreadState_GET()->recursion_critical = 0;
13004 /* The two references in interned are not counted by refcnt.
13005 The deallocator will take care of this */
13006 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013007 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000013008}
13009
13010void
13011PyUnicode_InternImmortal(PyObject **p)
13012{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013013 PyUnicodeObject *u = (PyUnicodeObject *)*p;
13014
Benjamin Peterson14339b62009-01-31 16:36:08 +000013015 PyUnicode_InternInPlace(p);
13016 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013017 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013018 Py_INCREF(*p);
13019 }
Walter Dörwald16807132007-05-25 13:52:07 +000013020}
13021
13022PyObject *
13023PyUnicode_InternFromString(const char *cp)
13024{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013025 PyObject *s = PyUnicode_FromString(cp);
13026 if (s == NULL)
13027 return NULL;
13028 PyUnicode_InternInPlace(&s);
13029 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000013030}
13031
Alexander Belopolsky40018472011-02-26 01:02:56 +000013032void
13033_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000013034{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013035 PyObject *keys;
13036 PyUnicodeObject *s;
13037 Py_ssize_t i, n;
13038 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000013039
Benjamin Peterson14339b62009-01-31 16:36:08 +000013040 if (interned == NULL || !PyDict_Check(interned))
13041 return;
13042 keys = PyDict_Keys(interned);
13043 if (keys == NULL || !PyList_Check(keys)) {
13044 PyErr_Clear();
13045 return;
13046 }
Walter Dörwald16807132007-05-25 13:52:07 +000013047
Benjamin Peterson14339b62009-01-31 16:36:08 +000013048 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
13049 detector, interned unicode strings are not forcibly deallocated;
13050 rather, we give them their stolen references back, and then clear
13051 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000013052
Benjamin Peterson14339b62009-01-31 16:36:08 +000013053 n = PyList_GET_SIZE(keys);
13054 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000013055 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013056 for (i = 0; i < n; i++) {
13057 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013058 if (PyUnicode_READY(s) == -1)
13059 fprintf(stderr, "could not ready string\n");
13060 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013061 case SSTATE_NOT_INTERNED:
13062 /* XXX Shouldn't happen */
13063 break;
13064 case SSTATE_INTERNED_IMMORTAL:
13065 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013066 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013067 break;
13068 case SSTATE_INTERNED_MORTAL:
13069 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013070 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013071 break;
13072 default:
13073 Py_FatalError("Inconsistent interned string state.");
13074 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013075 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013076 }
13077 fprintf(stderr, "total size of all interned strings: "
13078 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
13079 "mortal/immortal\n", mortal_size, immortal_size);
13080 Py_DECREF(keys);
13081 PyDict_Clear(interned);
13082 Py_DECREF(interned);
13083 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000013084}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013085
13086
13087/********************* Unicode Iterator **************************/
13088
13089typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013090 PyObject_HEAD
13091 Py_ssize_t it_index;
13092 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013093} unicodeiterobject;
13094
13095static void
13096unicodeiter_dealloc(unicodeiterobject *it)
13097{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013098 _PyObject_GC_UNTRACK(it);
13099 Py_XDECREF(it->it_seq);
13100 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013101}
13102
13103static int
13104unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
13105{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013106 Py_VISIT(it->it_seq);
13107 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013108}
13109
13110static PyObject *
13111unicodeiter_next(unicodeiterobject *it)
13112{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013113 PyUnicodeObject *seq;
13114 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013115
Benjamin Peterson14339b62009-01-31 16:36:08 +000013116 assert(it != NULL);
13117 seq = it->it_seq;
13118 if (seq == NULL)
13119 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013120 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013121
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013122 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
13123 int kind = PyUnicode_KIND(seq);
13124 void *data = PyUnicode_DATA(seq);
13125 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
13126 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013127 if (item != NULL)
13128 ++it->it_index;
13129 return item;
13130 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013131
Benjamin Peterson14339b62009-01-31 16:36:08 +000013132 Py_DECREF(seq);
13133 it->it_seq = NULL;
13134 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013135}
13136
13137static PyObject *
13138unicodeiter_len(unicodeiterobject *it)
13139{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013140 Py_ssize_t len = 0;
13141 if (it->it_seq)
13142 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
13143 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013144}
13145
13146PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
13147
13148static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013149 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000013150 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000013151 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013152};
13153
13154PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013155 PyVarObject_HEAD_INIT(&PyType_Type, 0)
13156 "str_iterator", /* tp_name */
13157 sizeof(unicodeiterobject), /* tp_basicsize */
13158 0, /* tp_itemsize */
13159 /* methods */
13160 (destructor)unicodeiter_dealloc, /* tp_dealloc */
13161 0, /* tp_print */
13162 0, /* tp_getattr */
13163 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013164 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013165 0, /* tp_repr */
13166 0, /* tp_as_number */
13167 0, /* tp_as_sequence */
13168 0, /* tp_as_mapping */
13169 0, /* tp_hash */
13170 0, /* tp_call */
13171 0, /* tp_str */
13172 PyObject_GenericGetAttr, /* tp_getattro */
13173 0, /* tp_setattro */
13174 0, /* tp_as_buffer */
13175 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
13176 0, /* tp_doc */
13177 (traverseproc)unicodeiter_traverse, /* tp_traverse */
13178 0, /* tp_clear */
13179 0, /* tp_richcompare */
13180 0, /* tp_weaklistoffset */
13181 PyObject_SelfIter, /* tp_iter */
13182 (iternextfunc)unicodeiter_next, /* tp_iternext */
13183 unicodeiter_methods, /* tp_methods */
13184 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013185};
13186
13187static PyObject *
13188unicode_iter(PyObject *seq)
13189{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013190 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013191
Benjamin Peterson14339b62009-01-31 16:36:08 +000013192 if (!PyUnicode_Check(seq)) {
13193 PyErr_BadInternalCall();
13194 return NULL;
13195 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013196 if (PyUnicode_READY(seq) == -1)
13197 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013198 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
13199 if (it == NULL)
13200 return NULL;
13201 it->it_index = 0;
13202 Py_INCREF(seq);
13203 it->it_seq = (PyUnicodeObject *)seq;
13204 _PyObject_GC_TRACK(it);
13205 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013206}
13207
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013208#define UNIOP(x) Py_UNICODE_##x
13209#define UNIOP_t Py_UNICODE
13210#include "uniops.h"
13211#undef UNIOP
13212#undef UNIOP_t
13213#define UNIOP(x) Py_UCS4_##x
13214#define UNIOP_t Py_UCS4
13215#include "uniops.h"
13216#undef UNIOP
13217#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000013218
Victor Stinner71133ff2010-09-01 23:43:53 +000013219Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000013220PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000013221{
13222 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
13223 Py_UNICODE *copy;
13224 Py_ssize_t size;
13225
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013226 if (!PyUnicode_Check(unicode)) {
13227 PyErr_BadArgument();
13228 return NULL;
13229 }
Victor Stinner71133ff2010-09-01 23:43:53 +000013230 /* Ensure we won't overflow the size. */
13231 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
13232 PyErr_NoMemory();
13233 return NULL;
13234 }
13235 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
13236 size *= sizeof(Py_UNICODE);
13237 copy = PyMem_Malloc(size);
13238 if (copy == NULL) {
13239 PyErr_NoMemory();
13240 return NULL;
13241 }
13242 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
13243 return copy;
13244}
Martin v. Löwis5b222132007-06-10 09:51:05 +000013245
Georg Brandl66c221e2010-10-14 07:04:07 +000013246/* A _string module, to export formatter_parser and formatter_field_name_split
13247 to the string.Formatter class implemented in Python. */
13248
13249static PyMethodDef _string_methods[] = {
13250 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
13251 METH_O, PyDoc_STR("split the argument as a field name")},
13252 {"formatter_parser", (PyCFunction) formatter_parser,
13253 METH_O, PyDoc_STR("parse the argument as a format string")},
13254 {NULL, NULL}
13255};
13256
13257static struct PyModuleDef _string_module = {
13258 PyModuleDef_HEAD_INIT,
13259 "_string",
13260 PyDoc_STR("string helper module"),
13261 0,
13262 _string_methods,
13263 NULL,
13264 NULL,
13265 NULL,
13266 NULL
13267};
13268
13269PyMODINIT_FUNC
13270PyInit__string(void)
13271{
13272 return PyModule_Create(&_string_module);
13273}
13274
13275
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013276#ifdef __cplusplus
13277}
13278#endif