blob: 284809d3e54f8ffb6e079432e2beb0fd0a24d6c5 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
/* Limit for the Unicode object free list */

#define PyUnicode_MAXFREELIST       1024

/* Limit for the Unicode object free list stay-alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a size less than this
   limit. This reduces malloc() overhead for small Unicode objects.

   At worst this will result in PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Endianness switches; defaults to little endian.  Exactly one of
   BYTEORDER_IS_BIG_ENDIAN / BYTEORDER_IS_LITTLE_ENDIAN gets defined. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
79
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000080/* --- Globals ------------------------------------------------------------
81
82 The globals are initialized by the _PyUnicode_Init() API and should
83 not be used before calling that API.
84
85*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000086
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000087
88#ifdef __cplusplus
89extern "C" {
90#endif
91
/* Object check used by the checked accessors below: the full consistency
   check in debug builds, a plain type check otherwise. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif

/* --- Raw field accessors: plain casts, no assertions; usable as lvalues. */

#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)

/* --- Checked accessors: assert on the object before reading it. */

/* UTF-8 pointer of a ready string.  For a compact ASCII object this is the
   address directly after the PyASCIIObject header; otherwise it is the
   utf8 field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))

/* UTF-8 length of a ready string: the length field for a compact ASCII
   object, the utf8_length field otherwise. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))

#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)

#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* Local override of PyUnicode_READY: evaluates to 0 when the object is
   already ready, otherwise to the result of _PyUnicode_Ready(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 : _PyUnicode_Ready((PyObject *)(op))))

/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data) */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
140
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Each source element in [begin, end) is cast to to_type and stored in
   consecutive slots starting at to.  begin, end and to are each evaluated
   exactly once, so argument expressions with side effects are safe. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        const from_type *iter_ = (begin);               \
        const from_type *end_ = (end);                  \
        to_type *to_ = (to_type *)(to);                 \
        for (; iter_ < end_; ++iter_, ++to_) {          \
            *to_ = (to_type)*iter_;                     \
        }                                               \
    } while (0)

/* The Unicode string has been modified: reset the hash */
#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)
158
Walter Dörwald16807132007-05-25 13:52:07 +0000159/* This dictionary holds all interned unicode strings. Note that references
160 to strings in this dictionary are *not* counted in the string's ob_refcnt.
161 When the interned string reaches a refcnt of 0 the string deallocation
162 function will delete the reference from this dictionary.
163
164 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000165 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000166*/
167static PyObject *interned;
168
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000169/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200170static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000171
172/* Single character Unicode strings in the Latin-1 range are being
173 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200174static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000175
/* Fast detection of the most frequent whitespace characters.

   128-entry table covering the ASCII range: an entry is 1 when the
   character at that index is whitespace, 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x00 - 0x07 */
/*     case 0x0009: * CHARACTER TABULATION */
/*     case 0x000A: * LINE FEED */
/*     case 0x000B: * LINE TABULATION */
/*     case 0x000C: * FORM FEED */
/*     case 0x000D: * CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,     /* 0x08 - 0x0F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x10 - 0x17 */
/*     case 0x001C: * FILE SEPARATOR */
/*     case 0x001D: * GROUP SEPARATOR */
/*     case 0x001E: * RECORD SEPARATOR */
/*     case 0x001F: * UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,     /* 0x18 - 0x1F */
/*     case 0x0020: * SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,     /* 0x20 - 0x27 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x28 - 0x2F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x30 - 0x37 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x38 - 0x3F */

    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x40 - 0x47 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x48 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x50 - 0x57 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x58 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x60 - 0x67 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x68 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x70 - 0x77 */
    0, 0, 0, 0, 0, 0, 0, 0      /* 0x78 - 0x7F */
};
206
Victor Stinnerfe226c02011-10-03 03:52:20 +0200207static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
208
Alexander Belopolsky40018472011-02-26 01:02:56 +0000209static PyObject *
210unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000211 PyObject **errorHandler,const char *encoding, const char *reason,
212 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
213 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
214
Alexander Belopolsky40018472011-02-26 01:02:56 +0000215static void
216raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300217 const char *encoding,
218 const Py_UNICODE *unicode, Py_ssize_t size,
219 Py_ssize_t startpos, Py_ssize_t endpos,
220 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000221
/* Fast detection of line break characters in the ASCII range.

   128-entry lookup table: an entry is 1 when the character at that index
   is a line break, 0 otherwise.  The table is only ever read, so it is
   declared const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x00 - 0x07 */
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,     /* 0x08 - 0x0F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x10 - 0x17 */
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,     /* 0x18 - 0x1F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x20 - 0x27 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x28 - 0x2F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x30 - 0x37 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x38 - 0x3F */

    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x40 - 0x47 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x48 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x50 - 0x57 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x58 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x60 - 0x67 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x68 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x70 - 0x77 */
    0, 0, 0, 0, 0, 0, 0, 0      /* 0x78 - 0x7F */
};
249
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300250/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
251 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000252Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000253PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000254{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000255#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000256 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000257#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000258 /* This is actually an illegal character, so it should
259 not be passed to unichr. */
260 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000261#endif
262}
263
#ifdef Py_DEBUG
/* Debug-build sanity check of a unicode object's state flags.

   Asserts that the kind/compact/ascii/ready flags (and, for non-compact
   objects, the wstr, data and utf8 pointers) form one of the accepted
   combinations.  Always returns 1 so that it can be used inside assert(). */
static int
_PyUnicode_CheckConsistency(void *op)
{
    PyASCIIObject *ascii = (PyASCIIObject *)op;
    PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
    PyUnicodeObject *unicode = (PyUnicodeObject *)op;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    kind = ascii->state.kind;

    /* ASCII strings: always 1-byte, compact and ready. */
    if (ascii->state.ascii == 1) {
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.compact == 1);
        assert(ascii->state.ready == 1);
        return 1;
    }

    /* Compact non-ASCII strings: any concrete kind, always ready. */
    if (ascii->state.compact == 1) {
        assert(kind == PyUnicode_1BYTE_KIND
               || kind == PyUnicode_2BYTE_KIND
               || kind == PyUnicode_4BYTE_KIND);
        assert(ascii->state.compact == 1);
        assert(ascii->state.ascii == 0);
        assert(ascii->state.ready == 1);
        return 1;
    }

    /* Non-compact strings. */
    if (kind == PyUnicode_WCHAR_KIND) {
        /* Not ready yet: only the wstr buffer exists. */
        assert(ascii->state.compact == 0);
        assert(ascii->state.ascii == 0);
        assert(ascii->state.ready == 0);
        assert(ascii->wstr != NULL);
        assert(unicode->data.any == NULL);
        assert(compact->utf8 == NULL);
        assert(ascii->state.interned == SSTATE_NOT_INTERNED);
    }
    else {
        /* Ready: a concrete kind with a separately stored data buffer. */
        assert(kind == PyUnicode_1BYTE_KIND
               || kind == PyUnicode_2BYTE_KIND
               || kind == PyUnicode_4BYTE_KIND);
        assert(ascii->state.compact == 0);
        assert(ascii->state.ready == 1);
        assert(unicode->data.any != NULL);
        assert(ascii->state.ascii == 0);
    }
    return 1;
}
#endif
314
/* --- Bloom Filters ----------------------------------------------------- */

/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low
   log2(BLOOM_WIDTH) bits of each unicode character as the bit index. */

/* the linebreak mask is set up by Unicode_Init below */

/* BLOOM_WIDTH: number of bits used in the mask, chosen from the width of
   a long (the mask type). */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* BLOOM_ADD sets the bit for ch in mask (mask must be an lvalue);
   BLOOM is non-zero if the bit for ch is set in mask.  A set bit only
   means "possibly present": characters whose low bits collide share a
   bit.  Both arguments are fully parenthesized so that any expression
   may be passed. */
#define BLOOM_ADD(mask, ch) ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Line break test: table lookup below 128; otherwise the bloom mask is
   consulted first and Py_UNICODE_ISLINEBREAK only on a possible hit. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000343
Alexander Belopolsky40018472011-02-26 01:02:56 +0000344Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200345make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000346{
347 /* calculate simple bloom-style bitmask for a given unicode string */
348
Antoine Pitrouf068f942010-01-13 14:19:12 +0000349 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000350 Py_ssize_t i;
351
352 mask = 0;
353 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200354 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000355
356 return mask;
357}
358
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200359#define BLOOM_MEMBER(mask, chr, str) \
360 (BLOOM(mask, chr) \
361 && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000362
Guido van Rossumd57fd912000-03-10 22:53:23 +0000363/* --- Unicode Object ----------------------------------------------------- */
364
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200365static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200366fixup(PyUnicodeObject *self, Py_UCS4 (*fixfct)(PyUnicodeObject *s));
367
368Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
369 Py_ssize_t size, Py_UCS4 ch,
370 int direction)
371{
372 /* like wcschr, but doesn't stop at NULL characters */
373 Py_ssize_t i;
374 if (direction == 1) {
375 for(i = 0; i < size; i++)
376 if (PyUnicode_READ(kind, s, i) == ch)
377 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
378 }
379 else {
380 for(i = size-1; i >= 0; i--)
381 if (PyUnicode_READ(kind, s, i) == ch)
382 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
383 }
384 return NULL;
385}
386
Victor Stinnerfe226c02011-10-03 03:52:20 +0200387static PyObject*
388resize_compact(PyObject *unicode, Py_ssize_t length)
389{
390 Py_ssize_t char_size;
391 Py_ssize_t struct_size;
392 Py_ssize_t new_size;
393 int share_wstr;
394
395 assert(PyUnicode_IS_READY(unicode));
396 char_size = PyUnicode_CHARACTER_SIZE(unicode);
397 if (PyUnicode_IS_COMPACT_ASCII(unicode))
398 struct_size = sizeof(PyASCIIObject);
399 else
400 struct_size = sizeof(PyCompactUnicodeObject);
401 share_wstr = (_PyUnicode_WSTR(unicode) == PyUnicode_DATA(unicode));
402
403 _Py_DEC_REFTOTAL;
404 _Py_ForgetReference(unicode);
405
406 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
407 PyErr_NoMemory();
408 return NULL;
409 }
410 new_size = (struct_size + (length + 1) * char_size);
411
412 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
413 if (unicode == NULL) {
414 PyObject_Del(unicode);
415 PyErr_NoMemory();
416 return NULL;
417 }
418 _Py_NewReference(unicode);
419 _PyUnicode_LENGTH(unicode) = length;
420 if (share_wstr)
421 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
422 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
423 length, 0);
424 return unicode;
425}
426
Alexander Belopolsky40018472011-02-26 01:02:56 +0000427static int
Victor Stinnerfe226c02011-10-03 03:52:20 +0200428resize_inplace(register PyUnicodeObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000429{
430 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000431
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200432 assert(!PyUnicode_IS_COMPACT(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200433
Victor Stinnerfe226c02011-10-03 03:52:20 +0200434 assert(Py_REFCNT(unicode) == 1);
Victor Stinnerb15d4d82011-09-28 23:59:20 +0200435 _PyUnicode_DIRTY(unicode);
Tim Petersced69f82003-09-16 20:30:58 +0000436
Victor Stinnerfe226c02011-10-03 03:52:20 +0200437 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
438 {
439 PyObject_DEL(_PyUnicode_UTF8(unicode));
440 _PyUnicode_UTF8(unicode) = NULL;
441 }
442
443 if (PyUnicode_IS_READY(unicode)) {
444 Py_ssize_t char_size;
445 Py_ssize_t new_size;
446 int share_wstr;
447 void *data;
448
449 data = _PyUnicode_DATA_ANY(unicode);
450 assert(data != NULL);
451 char_size = PyUnicode_CHARACTER_SIZE(unicode);
452 share_wstr = (_PyUnicode_WSTR(unicode) == data);
453
454 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
455 PyErr_NoMemory();
456 return -1;
457 }
458 new_size = (length + 1) * char_size;
459
460 data = (PyObject *)PyObject_REALLOC(data, new_size);
461 if (data == NULL) {
462 PyErr_NoMemory();
463 return -1;
464 }
465 _PyUnicode_DATA_ANY(unicode) = data;
466 if (share_wstr)
467 _PyUnicode_WSTR(unicode) = data;
468 _PyUnicode_LENGTH(unicode) = length;
469 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
470 if (share_wstr)
471 return 0;
472 }
473 if (_PyUnicode_WSTR(unicode) != NULL) {
474 assert(_PyUnicode_WSTR(unicode) != NULL);
475
476 oldstr = _PyUnicode_WSTR(unicode);
477 _PyUnicode_WSTR(unicode) = PyObject_REALLOC(_PyUnicode_WSTR(unicode),
478 sizeof(Py_UNICODE) * (length + 1));
479 if (!_PyUnicode_WSTR(unicode)) {
480 _PyUnicode_WSTR(unicode) = (Py_UNICODE *)oldstr;
481 PyErr_NoMemory();
482 return -1;
483 }
484 _PyUnicode_WSTR(unicode)[length] = 0;
485 _PyUnicode_WSTR_LENGTH(unicode) = length;
486 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000487 return 0;
488}
489
Victor Stinnerfe226c02011-10-03 03:52:20 +0200490static PyObject*
491resize_copy(PyObject *unicode, Py_ssize_t length)
492{
493 Py_ssize_t copy_length;
494 if (PyUnicode_IS_COMPACT(unicode)) {
495 PyObject *copy;
496 assert(PyUnicode_IS_READY(unicode));
497
498 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
499 if (copy == NULL)
500 return NULL;
501
502 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
503 if (PyUnicode_CopyCharacters(copy, 0,
504 unicode, 0,
505 copy_length) < 0)
506 {
507 Py_DECREF(copy);
508 return NULL;
509 }
510 return copy;
511 } else {
512 assert(_PyUnicode_WSTR(unicode) != NULL);
513 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
514 PyUnicodeObject *w = _PyUnicode_New(length);
515 if (w == NULL)
516 return NULL;
517 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
518 copy_length = Py_MIN(copy_length, length);
519 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
520 copy_length);
521 return (PyObject*)w;
522 }
523}
524
Guido van Rossumd57fd912000-03-10 22:53:23 +0000525/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000526 Ux0000 terminated; some code (e.g. new_identifier)
527 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000528
529 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000530 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000531
532*/
533
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200534#ifdef Py_DEBUG
535int unicode_old_new_calls = 0;
536#endif
537
Alexander Belopolsky40018472011-02-26 01:02:56 +0000538static PyUnicodeObject *
539_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000540{
541 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200542 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000543
Thomas Wouters477c8d52006-05-27 19:21:47 +0000544 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000545 if (length == 0 && unicode_empty != NULL) {
546 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200547 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000548 }
549
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000550 /* Ensure we won't overflow the size. */
551 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
552 return (PyUnicodeObject *)PyErr_NoMemory();
553 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200554 if (length < 0) {
555 PyErr_SetString(PyExc_SystemError,
556 "Negative size passed to _PyUnicode_New");
557 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000558 }
559
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200560#ifdef Py_DEBUG
561 ++unicode_old_new_calls;
562#endif
563
564 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
565 if (unicode == NULL)
566 return NULL;
567 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
568 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
569 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000570 PyErr_NoMemory();
571 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000572 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200573
Jeremy Hyltond8082792003-09-16 19:41:39 +0000574 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000575 * the caller fails before initializing str -- unicode_resize()
576 * reads str[0], and the Keep-Alive optimization can keep memory
577 * allocated for str alive across a call to unicode_dealloc(unicode).
578 * We don't want unicode_resize to read uninitialized memory in
579 * that case.
580 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200581 _PyUnicode_WSTR(unicode)[0] = 0;
582 _PyUnicode_WSTR(unicode)[length] = 0;
583 _PyUnicode_WSTR_LENGTH(unicode) = length;
584 _PyUnicode_HASH(unicode) = -1;
585 _PyUnicode_STATE(unicode).interned = 0;
586 _PyUnicode_STATE(unicode).kind = 0;
587 _PyUnicode_STATE(unicode).compact = 0;
588 _PyUnicode_STATE(unicode).ready = 0;
589 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200590 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200591 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200592 _PyUnicode_UTF8(unicode) = NULL;
593 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000594 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000595
Benjamin Peterson29060642009-01-31 22:14:21 +0000596 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000597 /* XXX UNREF/NEWREF interface should be more symmetrical */
598 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000599 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000600 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000601 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000602}
603
Victor Stinnerf42dc442011-10-02 23:33:16 +0200604static const char*
605unicode_kind_name(PyObject *unicode)
606{
Victor Stinner910337b2011-10-03 03:20:16 +0200607 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerf42dc442011-10-02 23:33:16 +0200608 if (!PyUnicode_IS_COMPACT(unicode))
609 {
610 if (!PyUnicode_IS_READY(unicode))
611 return "wstr";
612 switch(PyUnicode_KIND(unicode))
613 {
614 case PyUnicode_1BYTE_KIND:
615 if (PyUnicode_IS_COMPACT_ASCII(unicode))
616 return "legacy ascii";
617 else
618 return "legacy latin1";
619 case PyUnicode_2BYTE_KIND:
620 return "legacy UCS2";
621 case PyUnicode_4BYTE_KIND:
622 return "legacy UCS4";
623 default:
624 return "<legacy invalid kind>";
625 }
626 }
627 assert(PyUnicode_IS_READY(unicode));
628 switch(PyUnicode_KIND(unicode))
629 {
630 case PyUnicode_1BYTE_KIND:
631 if (PyUnicode_IS_COMPACT_ASCII(unicode))
632 return "ascii";
633 else
634 return "compact latin1";
635 case PyUnicode_2BYTE_KIND:
636 return "compact UCS2";
637 case PyUnicode_4BYTE_KIND:
638 return "compact UCS4";
639 default:
640 return "<invalid compact kind>";
641 }
642}
643
#ifdef Py_DEBUG
int unicode_new_new_calls = 0;

/* Functions wrapping macros for use in debugger */
char *_PyUnicode_utf8(void *unicode){
    return PyUnicode_UTF8(unicode);
}

void *_PyUnicode_compact_data(void *unicode) {
    return _PyUnicode_COMPACT_DATA(unicode);
}

/* Print the object address, its compact flags and the candidate data
   addresses to stdout, then return PyUnicode_DATA(unicode). */
void *_PyUnicode_data(void *unicode){
    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
    printf("compact data %p\n", (void *)_PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}

/* Print a one-line summary of a unicode object to stdout: kind name,
   length and wstr pointer; for non-ASCII objects also the wstr length and
   the UTF-8 pointer and length; for non-compact objects also the data
   pointer.

   Lengths are Py_ssize_t (signed) and are printed with %zd; pointers are
   converted to void * as %p requires. */
void
_PyUnicode_Dump(PyObject *op)
{
    PyASCIIObject *ascii = (PyASCIIObject *)op;
    printf("%s: len=%zd, wstr=%p",
           unicode_kind_name(op),
           ascii->length,
           (void *)ascii->wstr);
    if (!ascii->state.ascii) {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        printf(" (%zd), utf8=%p (%zd)",
               compact->wstr_length,
               (void *)compact->utf8,
               compact->utf8_length);
    }
    if (!ascii->state.compact) {
        PyUnicodeObject *unicode = (PyUnicodeObject *)op;
        printf(", data=%p",
               (void *)unicode->data.any);
    }
    printf("\n");
}
#endif
688
689PyObject *
690PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
691{
692 PyObject *obj;
693 PyCompactUnicodeObject *unicode;
694 void *data;
695 int kind_state;
696 int is_sharing = 0, is_ascii = 0;
697 Py_ssize_t char_size;
698 Py_ssize_t struct_size;
699
700 /* Optimization for empty strings */
701 if (size == 0 && unicode_empty != NULL) {
702 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200703 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200704 }
705
706#ifdef Py_DEBUG
707 ++unicode_new_new_calls;
708#endif
709
710 struct_size = sizeof(PyCompactUnicodeObject);
711 if (maxchar < 128) {
712 kind_state = PyUnicode_1BYTE_KIND;
713 char_size = 1;
714 is_ascii = 1;
715 struct_size = sizeof(PyASCIIObject);
716 }
717 else if (maxchar < 256) {
718 kind_state = PyUnicode_1BYTE_KIND;
719 char_size = 1;
720 }
721 else if (maxchar < 65536) {
722 kind_state = PyUnicode_2BYTE_KIND;
723 char_size = 2;
724 if (sizeof(wchar_t) == 2)
725 is_sharing = 1;
726 }
727 else {
728 kind_state = PyUnicode_4BYTE_KIND;
729 char_size = 4;
730 if (sizeof(wchar_t) == 4)
731 is_sharing = 1;
732 }
733
734 /* Ensure we won't overflow the size. */
735 if (size < 0) {
736 PyErr_SetString(PyExc_SystemError,
737 "Negative size passed to PyUnicode_New");
738 return NULL;
739 }
740 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
741 return PyErr_NoMemory();
742
743 /* Duplicated allocation code from _PyObject_New() instead of a call to
744 * PyObject_New() so we are able to allocate space for the object and
745 * it's data buffer.
746 */
747 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
748 if (obj == NULL)
749 return PyErr_NoMemory();
750 obj = PyObject_INIT(obj, &PyUnicode_Type);
751 if (obj == NULL)
752 return NULL;
753
754 unicode = (PyCompactUnicodeObject *)obj;
755 if (is_ascii)
756 data = ((PyASCIIObject*)obj) + 1;
757 else
758 data = unicode + 1;
759 _PyUnicode_LENGTH(unicode) = size;
760 _PyUnicode_HASH(unicode) = -1;
761 _PyUnicode_STATE(unicode).interned = 0;
762 _PyUnicode_STATE(unicode).kind = kind_state;
763 _PyUnicode_STATE(unicode).compact = 1;
764 _PyUnicode_STATE(unicode).ready = 1;
765 _PyUnicode_STATE(unicode).ascii = is_ascii;
766 if (is_ascii) {
767 ((char*)data)[size] = 0;
768 _PyUnicode_WSTR(unicode) = NULL;
769 }
770 else if (kind_state == PyUnicode_1BYTE_KIND) {
771 ((char*)data)[size] = 0;
772 _PyUnicode_WSTR(unicode) = NULL;
773 _PyUnicode_WSTR_LENGTH(unicode) = 0;
774 unicode->utf8_length = 0;
775 unicode->utf8 = NULL;
776 }
777 else {
778 unicode->utf8 = NULL;
779 if (kind_state == PyUnicode_2BYTE_KIND)
780 ((Py_UCS2*)data)[size] = 0;
781 else /* kind_state == PyUnicode_4BYTE_KIND */
782 ((Py_UCS4*)data)[size] = 0;
783 if (is_sharing) {
784 _PyUnicode_WSTR_LENGTH(unicode) = size;
785 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
786 }
787 else {
788 _PyUnicode_WSTR_LENGTH(unicode) = 0;
789 _PyUnicode_WSTR(unicode) = NULL;
790 }
791 }
792 return obj;
793}
794
#if SIZEOF_WCHAR_T == 2
/* Decode the 16-bit wchar_t range [begin, end) into the UCS4 buffer of
   `unicode`.  A high surrogate (0xD800-0xDBFF) immediately followed by a low
   surrogate (0xDC00-0xDFFF) is combined into one code point; every other
   unit, including a lone surrogate, is copied through unchanged.

   `unicode` must already be a 4-byte-kind string whose length equals the
   number of code points that will be produced; this is only verified by
   assertions.  The caller is expected to have reserved room for a
   terminating null character as well; none is written here. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyUnicodeObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        int is_pair;

        /* There must still be room for at least one more code point. */
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));

        is_pair = (src[0] >= 0xD800 && src[0] <= 0xDBFF
                   && (src + 1) < end
                   && src[1] >= 0xDC00 && src[1] <= 0xDFFF);
        if (!is_pair) {
            *dst++ = *src++;
            continue;
        }

        /* 10 payload bits from each half, offset past the BMP. */
        *dst = (((src[0] & 0x3FF) << 10) | (src[1] & 0x3FF)) + 0x10000;
        dst++;
        src += 2;
    }

    /* The output must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
833
Victor Stinnercd9950f2011-10-02 00:34:53 +0200834static int
835_PyUnicode_Dirty(PyObject *unicode)
836{
Victor Stinner910337b2011-10-03 03:20:16 +0200837 assert(_PyUnicode_CHECK(unicode));
Victor Stinnercd9950f2011-10-02 00:34:53 +0200838 if (Py_REFCNT(unicode) != 1) {
839 PyErr_SetString(PyExc_ValueError,
840 "Cannot modify a string having more than 1 reference");
841 return -1;
842 }
843 _PyUnicode_DIRTY(unicode);
844 return 0;
845}
846
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200847Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200848PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
849 PyObject *from, Py_ssize_t from_start,
850 Py_ssize_t how_many)
851{
Victor Stinnera0702ab2011-09-29 14:14:38 +0200852 unsigned int from_kind, to_kind;
853 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200854
Victor Stinnerb1536152011-09-30 02:26:10 +0200855 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
856 PyErr_BadInternalCall();
857 return -1;
858 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200859
860 if (PyUnicode_READY(from))
861 return -1;
862 if (PyUnicode_READY(to))
863 return -1;
864
Victor Stinnerff9e50f2011-09-28 22:17:19 +0200865 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200866 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
867 PyErr_Format(PyExc_ValueError,
868 "Cannot write %zi characters at %zi "
869 "in a string of %zi characters",
870 how_many, to_start, PyUnicode_GET_LENGTH(to));
871 return -1;
872 }
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200873 if (how_many == 0)
874 return 0;
875
Victor Stinnercd9950f2011-10-02 00:34:53 +0200876 if (_PyUnicode_Dirty(to))
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200877 return -1;
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200878
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200879 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200880 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200881 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200882 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200883
Victor Stinnerf42dc442011-10-02 23:33:16 +0200884 if (from_kind == to_kind
885 /* deny latin1 => ascii */
886 && PyUnicode_MAX_CHAR_VALUE(to) >= PyUnicode_MAX_CHAR_VALUE(from))
887 {
Victor Stinnera0702ab2011-09-29 14:14:38 +0200888 Py_MEMCPY((char*)to_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200889 + PyUnicode_KIND_SIZE(to_kind, to_start),
Victor Stinnera0702ab2011-09-29 14:14:38 +0200890 (char*)from_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200891 + PyUnicode_KIND_SIZE(from_kind, from_start),
892 PyUnicode_KIND_SIZE(to_kind, how_many));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200893 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200894 else if (from_kind == PyUnicode_1BYTE_KIND
895 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200896 {
897 _PyUnicode_CONVERT_BYTES(
898 Py_UCS1, Py_UCS2,
899 PyUnicode_1BYTE_DATA(from) + from_start,
900 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
901 PyUnicode_2BYTE_DATA(to) + to_start
902 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200903 }
Victor Stinner157f83f2011-09-28 21:41:31 +0200904 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200905 && to_kind == PyUnicode_4BYTE_KIND)
906 {
907 _PyUnicode_CONVERT_BYTES(
908 Py_UCS1, Py_UCS4,
909 PyUnicode_1BYTE_DATA(from) + from_start,
910 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
911 PyUnicode_4BYTE_DATA(to) + to_start
912 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200913 }
914 else if (from_kind == PyUnicode_2BYTE_KIND
915 && to_kind == PyUnicode_4BYTE_KIND)
916 {
917 _PyUnicode_CONVERT_BYTES(
918 Py_UCS2, Py_UCS4,
919 PyUnicode_2BYTE_DATA(from) + from_start,
920 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
921 PyUnicode_4BYTE_DATA(to) + to_start
922 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200923 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200924 else {
925 int invalid_kinds;
Victor Stinnerf42dc442011-10-02 23:33:16 +0200926
927 /* check if max_char(from substring) <= max_char(to) */
928 if (from_kind > to_kind
929 /* latin1 => ascii */
930 || (PyUnicode_IS_COMPACT_ASCII(to)
931 && to_kind == PyUnicode_1BYTE_KIND
932 && !PyUnicode_IS_COMPACT_ASCII(from)))
933 {
Victor Stinnera0702ab2011-09-29 14:14:38 +0200934 /* slow path to check for character overflow */
935 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
936 Py_UCS4 ch, maxchar;
937 Py_ssize_t i;
938
939 maxchar = 0;
940 invalid_kinds = 0;
941 for (i=0; i < how_many; i++) {
942 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
943 if (ch > maxchar) {
944 maxchar = ch;
945 if (maxchar > to_maxchar) {
946 invalid_kinds = 1;
947 break;
948 }
949 }
950 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
951 }
952 }
953 else
954 invalid_kinds = 1;
955 if (invalid_kinds) {
956 PyErr_Format(PyExc_ValueError,
Victor Stinnerf42dc442011-10-02 23:33:16 +0200957 "Cannot copy %s characters "
958 "into a string of %s characters",
959 unicode_kind_name(from),
960 unicode_kind_name(to));
Victor Stinnera0702ab2011-09-29 14:14:38 +0200961 return -1;
962 }
963 }
964 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200965}
966
Victor Stinner17222162011-09-28 22:15:37 +0200967/* Find the maximum code point and count the number of surrogate pairs so a
968 correct string length can be computed before converting a string to UCS4.
969 This function counts single surrogates as a character and not as a pair.
970
971 Return 0 on success, or -1 on error. */
972static int
973find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
974 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200975{
976 const wchar_t *iter;
977
Victor Stinnerc53be962011-10-02 21:33:54 +0200978 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200979 if (num_surrogates == NULL || maxchar == NULL) {
980 PyErr_SetString(PyExc_SystemError,
981 "unexpected NULL arguments to "
982 "PyUnicode_FindMaxCharAndNumSurrogatePairs");
983 return -1;
984 }
985
986 *num_surrogates = 0;
987 *maxchar = 0;
988
989 for (iter = begin; iter < end; ) {
990 if (*iter > *maxchar)
991 *maxchar = *iter;
992#if SIZEOF_WCHAR_T == 2
993 if (*iter >= 0xD800 && *iter <= 0xDBFF
994 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
995 {
996 Py_UCS4 surrogate_val;
997 surrogate_val = (((iter[0] & 0x3FF)<<10)
998 | (iter[1] & 0x3FF)) + 0x10000;
999 ++(*num_surrogates);
1000 if (surrogate_val > *maxchar)
1001 *maxchar = surrogate_val;
1002 iter += 2;
1003 }
1004 else
1005 iter++;
1006#else
1007 iter++;
1008#endif
1009 }
1010 return 0;
1011}
1012
1013#ifdef Py_DEBUG
1014int unicode_ready_calls = 0;
1015#endif
1016
1017int
Victor Stinnerd8f65102011-09-29 19:43:17 +02001018_PyUnicode_Ready(PyObject *obj)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001019{
Victor Stinnerd8f65102011-09-29 19:43:17 +02001020 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001021 wchar_t *end;
1022 Py_UCS4 maxchar = 0;
1023 Py_ssize_t num_surrogates;
1024#if SIZEOF_WCHAR_T == 2
1025 Py_ssize_t length_wo_surrogates;
1026#endif
1027
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001028 /* _PyUnicode_Ready() is only intented for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001029 strings were created using _PyObject_New() and where no canonical
1030 representation (the str field) has been set yet aka strings
1031 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001032 assert(_PyUnicode_CHECK(unicode));
1033 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001034 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001035 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001036 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001037 /* Actually, it should neither be interned nor be anything else: */
1038 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001039
1040#ifdef Py_DEBUG
1041 ++unicode_ready_calls;
1042#endif
1043
1044 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001045 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001046 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001047 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001048
1049 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001050 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1051 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001052 PyErr_NoMemory();
1053 return -1;
1054 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001055 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001056 _PyUnicode_WSTR(unicode), end,
1057 PyUnicode_1BYTE_DATA(unicode));
1058 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1059 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1060 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1061 if (maxchar < 128) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001062 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001063 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001064 }
1065 else {
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001066 _PyUnicode_UTF8(unicode) = NULL;
1067 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001068 }
1069 PyObject_FREE(_PyUnicode_WSTR(unicode));
1070 _PyUnicode_WSTR(unicode) = NULL;
1071 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1072 }
1073 /* In this case we might have to convert down from 4-byte native
1074 wchar_t to 2-byte unicode. */
1075 else if (maxchar < 65536) {
1076 assert(num_surrogates == 0 &&
1077 "FindMaxCharAndNumSurrogatePairs() messed up");
1078
Victor Stinner506f5922011-09-28 22:34:18 +02001079#if SIZEOF_WCHAR_T == 2
1080 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001081 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001082 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1083 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1084 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001085 _PyUnicode_UTF8(unicode) = NULL;
1086 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001087#else
1088 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001089 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001090 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001091 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001092 PyErr_NoMemory();
1093 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001094 }
Victor Stinner506f5922011-09-28 22:34:18 +02001095 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1096 _PyUnicode_WSTR(unicode), end,
1097 PyUnicode_2BYTE_DATA(unicode));
1098 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1099 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1100 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001101 _PyUnicode_UTF8(unicode) = NULL;
1102 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001103 PyObject_FREE(_PyUnicode_WSTR(unicode));
1104 _PyUnicode_WSTR(unicode) = NULL;
1105 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1106#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001107 }
1108 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1109 else {
1110#if SIZEOF_WCHAR_T == 2
1111 /* in case the native representation is 2-bytes, we need to allocate a
1112 new normalized 4-byte version. */
1113 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001114 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1115 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001116 PyErr_NoMemory();
1117 return -1;
1118 }
1119 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1120 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001121 _PyUnicode_UTF8(unicode) = NULL;
1122 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinnerc53be962011-10-02 21:33:54 +02001123 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001124 PyObject_FREE(_PyUnicode_WSTR(unicode));
1125 _PyUnicode_WSTR(unicode) = NULL;
1126 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1127#else
1128 assert(num_surrogates == 0);
1129
Victor Stinnerc3c74152011-10-02 20:39:55 +02001130 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001131 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001132 _PyUnicode_UTF8(unicode) = NULL;
1133 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001134 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1135#endif
1136 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1137 }
1138 _PyUnicode_STATE(unicode).ready = 1;
1139 return 0;
1140}
1141
Alexander Belopolsky40018472011-02-26 01:02:56 +00001142static void
1143unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001144{
Walter Dörwald16807132007-05-25 13:52:07 +00001145 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001146 case SSTATE_NOT_INTERNED:
1147 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001148
Benjamin Peterson29060642009-01-31 22:14:21 +00001149 case SSTATE_INTERNED_MORTAL:
1150 /* revive dead object temporarily for DelItem */
1151 Py_REFCNT(unicode) = 3;
1152 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
1153 Py_FatalError(
1154 "deletion of interned string failed");
1155 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001156
Benjamin Peterson29060642009-01-31 22:14:21 +00001157 case SSTATE_INTERNED_IMMORTAL:
1158 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001159
Benjamin Peterson29060642009-01-31 22:14:21 +00001160 default:
1161 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001162 }
1163
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001164 if (_PyUnicode_WSTR(unicode) &&
1165 (!PyUnicode_IS_READY(unicode) ||
1166 _PyUnicode_WSTR(unicode) != PyUnicode_DATA(unicode)))
1167 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001168 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001169 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001170
1171 if (PyUnicode_IS_COMPACT(unicode)) {
1172 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001173 }
1174 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001175 if (_PyUnicode_DATA_ANY(unicode))
1176 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Benjamin Peterson29060642009-01-31 22:14:21 +00001177 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001178 }
1179}
1180
Alexander Belopolsky40018472011-02-26 01:02:56 +00001181static int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001182unicode_resizable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001183{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001184 if (Py_REFCNT(unicode) != 1)
1185 return 0;
1186 if (PyUnicode_CHECK_INTERNED(unicode))
1187 return 0;
1188 if (unicode == unicode_empty)
1189 return 0;
1190 if (PyUnicode_WSTR_LENGTH(unicode) == 1) {
1191 Py_UCS4 ch;
1192 if (PyUnicode_IS_COMPACT(unicode))
1193 ch = PyUnicode_READ_CHAR(unicode, 0);
1194 else
1195 ch = _PyUnicode_WSTR(unicode)[0];
1196 if (ch < 256 && unicode_latin1[ch] == unicode)
1197 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001198 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001199 /* FIXME: reenable resize_inplace */
1200 if (!PyUnicode_IS_COMPACT(unicode))
1201 return 0;
1202 return 1;
1203}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001204
Victor Stinnerfe226c02011-10-03 03:52:20 +02001205static int
1206unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1207{
1208 PyObject *unicode;
1209 Py_ssize_t old_length;
1210
1211 assert(p_unicode != NULL);
1212 unicode = *p_unicode;
1213
1214 assert(unicode != NULL);
1215 assert(PyUnicode_Check(unicode));
1216 assert(0 <= length);
1217
Victor Stinner910337b2011-10-03 03:20:16 +02001218 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001219 old_length = PyUnicode_WSTR_LENGTH(unicode);
1220 else
1221 old_length = PyUnicode_GET_LENGTH(unicode);
1222 if (old_length == length)
1223 return 0;
1224
1225 /* FIXME: really create a new object? */
1226 if (!unicode_resizable(unicode)) {
1227 PyObject *copy = resize_copy(unicode, length);
1228 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001229 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001230 Py_DECREF(*p_unicode);
1231 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001232 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001233 }
1234
Victor Stinnerfe226c02011-10-03 03:52:20 +02001235 if (PyUnicode_IS_COMPACT(unicode)) {
1236 *p_unicode = resize_compact(unicode, length);
1237 if (*p_unicode == NULL)
1238 return -1;
1239 return 0;
1240 } else
1241 return resize_inplace((PyUnicodeObject*)unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001242}
1243
Alexander Belopolsky40018472011-02-26 01:02:56 +00001244int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001245PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001246{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001247 PyObject *unicode;
1248 if (p_unicode == NULL) {
1249 PyErr_BadInternalCall();
1250 return -1;
1251 }
1252 unicode = *p_unicode;
1253 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0
1254 || _PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND)
1255 {
1256 PyErr_BadInternalCall();
1257 return -1;
1258 }
1259 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001260}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001261
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001262static PyObject*
1263get_latin1_char(unsigned char ch)
1264{
Victor Stinnera464fc12011-10-02 20:39:30 +02001265 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001266 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001267 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001268 if (!unicode)
1269 return NULL;
1270 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1271 unicode_latin1[ch] = unicode;
1272 }
1273 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001274 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001275}
1276
Alexander Belopolsky40018472011-02-26 01:02:56 +00001277PyObject *
1278PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001279{
1280 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001281 Py_UCS4 maxchar = 0;
1282 Py_ssize_t num_surrogates;
1283
1284 if (u == NULL)
1285 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001286
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001287 /* If the Unicode data is known at construction time, we can apply
1288 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001289
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001290 /* Optimization for empty strings */
1291 if (size == 0 && unicode_empty != NULL) {
1292 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001293 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001294 }
Tim Petersced69f82003-09-16 20:30:58 +00001295
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001296 /* Single character Unicode objects in the Latin-1 range are
1297 shared when using this constructor */
1298 if (size == 1 && *u < 256)
1299 return get_latin1_char((unsigned char)*u);
1300
1301 /* If not empty and not single character, copy the Unicode data
1302 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001303 if (find_maxchar_surrogates(u, u + size,
1304 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001305 return NULL;
1306
1307 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1308 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001309 if (!unicode)
1310 return NULL;
1311
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001312 switch (PyUnicode_KIND(unicode)) {
1313 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001314 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001315 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1316 break;
1317 case PyUnicode_2BYTE_KIND:
1318#if Py_UNICODE_SIZE == 2
1319 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1320#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001321 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001322 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1323#endif
1324 break;
1325 case PyUnicode_4BYTE_KIND:
1326#if SIZEOF_WCHAR_T == 2
1327 /* This is the only case which has to process surrogates, thus
1328 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001329 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001330#else
1331 assert(num_surrogates == 0);
1332 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1333#endif
1334 break;
1335 default:
1336 assert(0 && "Impossible state");
1337 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001338
1339 return (PyObject *)unicode;
1340}
1341
Alexander Belopolsky40018472011-02-26 01:02:56 +00001342PyObject *
1343PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001344{
1345 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001346
Benjamin Peterson14339b62009-01-31 16:36:08 +00001347 if (size < 0) {
1348 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001349 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001350 return NULL;
1351 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001352
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001353 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001354 some optimizations which share commonly used objects.
1355 Also, this means the input must be UTF-8, so fall back to the
1356 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001357 if (u != NULL) {
1358
Benjamin Peterson29060642009-01-31 22:14:21 +00001359 /* Optimization for empty strings */
1360 if (size == 0 && unicode_empty != NULL) {
1361 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001362 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001363 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001364
1365 /* Single characters are shared when using this constructor.
1366 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001367 if (size == 1 && Py_CHARMASK(*u) < 128)
1368 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001369
1370 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001371 }
1372
Walter Dörwald55507312007-05-18 13:12:10 +00001373 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001374 if (!unicode)
1375 return NULL;
1376
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001377 return (PyObject *)unicode;
1378}
1379
Alexander Belopolsky40018472011-02-26 01:02:56 +00001380PyObject *
1381PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001382{
1383 size_t size = strlen(u);
1384 if (size > PY_SSIZE_T_MAX) {
1385 PyErr_SetString(PyExc_OverflowError, "input too long");
1386 return NULL;
1387 }
1388
1389 return PyUnicode_FromStringAndSize(u, size);
1390}
1391
Victor Stinnere57b1c02011-09-28 22:20:48 +02001392static PyObject*
1393_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001394{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001395 PyObject *res;
1396 unsigned char max = 127;
1397 Py_ssize_t i;
1398 for (i = 0; i < size; i++) {
1399 if (u[i] & 0x80) {
1400 max = 255;
1401 break;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001402 }
1403 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001404 res = PyUnicode_New(size, max);
1405 if (!res)
1406 return NULL;
1407 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1408 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001409}
1410
Victor Stinnere57b1c02011-09-28 22:20:48 +02001411static PyObject*
1412_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001413{
1414 PyObject *res;
1415 Py_UCS2 max = 0;
1416 Py_ssize_t i;
1417 for (i = 0; i < size; i++)
1418 if (u[i] > max)
1419 max = u[i];
1420 res = PyUnicode_New(size, max);
1421 if (!res)
1422 return NULL;
1423 if (max >= 256)
1424 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1425 else
1426 for (i = 0; i < size; i++)
1427 PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i];
1428 return res;
1429}
1430
Victor Stinnere57b1c02011-09-28 22:20:48 +02001431static PyObject*
1432_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001433{
1434 PyObject *res;
1435 Py_UCS4 max = 0;
1436 Py_ssize_t i;
1437 for (i = 0; i < size; i++)
1438 if (u[i] > max)
1439 max = u[i];
1440 res = PyUnicode_New(size, max);
1441 if (!res)
1442 return NULL;
1443 if (max >= 0x10000)
1444 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1445 else {
1446 int kind = PyUnicode_KIND(res);
1447 void *data = PyUnicode_DATA(res);
1448 for (i = 0; i < size; i++)
1449 PyUnicode_WRITE(kind, data, i, u[i]);
1450 }
1451 return res;
1452}
1453
1454PyObject*
1455PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1456{
1457 switch(kind) {
1458 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001459 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001460 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001461 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001462 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001463 return _PyUnicode_FromUCS4(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001464 }
Victor Stinner202b62b2011-10-01 23:48:37 +02001465 PyErr_SetString(PyExc_ValueError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001466 return NULL;
1467}
1468
Victor Stinner034f6cf2011-09-30 02:26:44 +02001469PyObject*
1470PyUnicode_Copy(PyObject *unicode)
1471{
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001472 Py_ssize_t size;
1473 PyObject *copy;
1474 void *data;
1475
Victor Stinner034f6cf2011-09-30 02:26:44 +02001476 if (!PyUnicode_Check(unicode)) {
1477 PyErr_BadInternalCall();
1478 return NULL;
1479 }
1480 if (PyUnicode_READY(unicode))
1481 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001482
1483 size = PyUnicode_GET_LENGTH(unicode);
1484 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1485 if (!copy)
1486 return NULL;
1487 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1488
1489 data = PyUnicode_DATA(unicode);
1490 switch (PyUnicode_KIND(unicode))
1491 {
1492 case PyUnicode_1BYTE_KIND:
1493 memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1494 break;
1495 case PyUnicode_2BYTE_KIND:
1496 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1497 break;
1498 case PyUnicode_4BYTE_KIND:
1499 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1500 break;
1501 default:
1502 assert(0);
1503 break;
1504 }
1505 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001506}
1507
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001508
Victor Stinnerbc603d12011-10-02 01:00:40 +02001509/* Widen Unicode objects to larger buffers. Don't write terminating null
1510 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001511
1512void*
1513_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1514{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001515 Py_ssize_t len;
1516 void *result;
1517 unsigned int skind;
1518
1519 if (PyUnicode_READY(s))
1520 return NULL;
1521
1522 len = PyUnicode_GET_LENGTH(s);
1523 skind = PyUnicode_KIND(s);
1524 if (skind >= kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001525 PyErr_SetString(PyExc_RuntimeError, "invalid widening attempt");
1526 return NULL;
1527 }
1528 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001529 case PyUnicode_2BYTE_KIND:
1530 result = PyMem_Malloc(len * sizeof(Py_UCS2));
1531 if (!result)
1532 return PyErr_NoMemory();
1533 assert(skind == PyUnicode_1BYTE_KIND);
1534 _PyUnicode_CONVERT_BYTES(
1535 Py_UCS1, Py_UCS2,
1536 PyUnicode_1BYTE_DATA(s),
1537 PyUnicode_1BYTE_DATA(s) + len,
1538 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001539 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001540 case PyUnicode_4BYTE_KIND:
1541 result = PyMem_Malloc(len * sizeof(Py_UCS4));
1542 if (!result)
1543 return PyErr_NoMemory();
1544 if (skind == PyUnicode_2BYTE_KIND) {
1545 _PyUnicode_CONVERT_BYTES(
1546 Py_UCS2, Py_UCS4,
1547 PyUnicode_2BYTE_DATA(s),
1548 PyUnicode_2BYTE_DATA(s) + len,
1549 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001550 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001551 else {
1552 assert(skind == PyUnicode_1BYTE_KIND);
1553 _PyUnicode_CONVERT_BYTES(
1554 Py_UCS1, Py_UCS4,
1555 PyUnicode_1BYTE_DATA(s),
1556 PyUnicode_1BYTE_DATA(s) + len,
1557 result);
1558 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001559 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001560 default:
1561 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001562 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001563 PyErr_SetString(PyExc_ValueError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001564 return NULL;
1565}
1566
1567static Py_UCS4*
1568as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1569 int copy_null)
1570{
1571 int kind;
1572 void *data;
1573 Py_ssize_t len, targetlen;
1574 if (PyUnicode_READY(string) == -1)
1575 return NULL;
1576 kind = PyUnicode_KIND(string);
1577 data = PyUnicode_DATA(string);
1578 len = PyUnicode_GET_LENGTH(string);
1579 targetlen = len;
1580 if (copy_null)
1581 targetlen++;
1582 if (!target) {
1583 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1584 PyErr_NoMemory();
1585 return NULL;
1586 }
1587 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1588 if (!target) {
1589 PyErr_NoMemory();
1590 return NULL;
1591 }
1592 }
1593 else {
1594 if (targetsize < targetlen) {
1595 PyErr_Format(PyExc_SystemError,
1596 "string is longer than the buffer");
1597 if (copy_null && 0 < targetsize)
1598 target[0] = 0;
1599 return NULL;
1600 }
1601 }
1602 if (kind != PyUnicode_4BYTE_KIND) {
1603 Py_ssize_t i;
1604 for (i = 0; i < len; i++)
1605 target[i] = PyUnicode_READ(kind, data, i);
1606 }
1607 else
1608 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1609 if (copy_null)
1610 target[len] = 0;
1611 return target;
1612}
1613
1614Py_UCS4*
1615PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1616 int copy_null)
1617{
1618 if (target == NULL || targetsize < 1) {
1619 PyErr_BadInternalCall();
1620 return NULL;
1621 }
1622 return as_ucs4(string, target, targetsize, copy_null);
1623}
1624
1625Py_UCS4*
1626PyUnicode_AsUCS4Copy(PyObject *string)
1627{
1628 return as_ucs4(string, NULL, 0, 1);
1629}
1630
1631#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00001632
Alexander Belopolsky40018472011-02-26 01:02:56 +00001633PyObject *
1634PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001635{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001636 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +00001637 if (size == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001638 return PyUnicode_New(0, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00001639 PyErr_BadInternalCall();
1640 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001641 }
1642
Martin v. Löwis790465f2008-04-05 20:41:37 +00001643 if (size == -1) {
1644 size = wcslen(w);
1645 }
1646
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001647 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001648}
1649
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001650#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00001651
Walter Dörwald346737f2007-05-31 10:44:43 +00001652static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001653makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
1654 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00001655{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001656 *fmt++ = '%';
1657 if (width) {
1658 if (zeropad)
1659 *fmt++ = '0';
1660 fmt += sprintf(fmt, "%d", width);
1661 }
1662 if (precision)
1663 fmt += sprintf(fmt, ".%d", precision);
1664 if (longflag)
1665 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001666 else if (longlongflag) {
1667 /* longlongflag should only ever be nonzero on machines with
1668 HAVE_LONG_LONG defined */
1669#ifdef HAVE_LONG_LONG
1670 char *f = PY_FORMAT_LONG_LONG;
1671 while (*f)
1672 *fmt++ = *f++;
1673#else
1674 /* we shouldn't ever get here */
1675 assert(0);
1676 *fmt++ = 'l';
1677#endif
1678 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001679 else if (size_tflag) {
1680 char *f = PY_FORMAT_SIZE_T;
1681 while (*f)
1682 *fmt++ = *f++;
1683 }
1684 *fmt++ = c;
1685 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00001686}
1687
/* Helper for PyUnicode_FromFormatV(): parse one format specification.

   `f` points at the '%' that starts the specification.  The optional
   "width" and ".precision" digits and an optional length modifier ("l",
   "ll" when HAVE_LONG_LONG is defined, or "z") are consumed.  A length
   modifier is only recognised when immediately followed by 'd', 'u' or 'i'.

   Each non-NULL output pointer receives the parsed value (0 when absent).
   The returned pointer designates the conversion character, or the last
   character parsed when the specification is truncated or malformed. */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int field_width = 0;
    int field_precision = 0;
    int is_long = 0;
    int is_longlong = 0;
    int is_size_t = 0;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    f++;   /* step over the '%' */
    for (; Py_ISDIGIT((unsigned)*f); f++)
        field_width = (field_width*10) + *f - '0';
    if (*f == '.') {
        f++;
        for (; Py_ISDIGIT((unsigned)*f); f++)
            field_precision = (field_precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }
    if (p_width != NULL)
        *p_width = field_width;
    if (p_precision != NULL)
        *p_precision = field_precision;

    /* length modifiers: %ld, %lu, %li, %lld, %llu, %lli, %zd, %zu, %zi */
    switch (*f) {
    case 'l':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            is_long = 1;
            ++f;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            is_longlong = 1;
            f += 2;
        }
#endif
        break;
    case 'z':
        /* size_t flag */
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            is_size_t = 1;
            ++f;
        }
        break;
    default:
        break;
    }

    if (p_longflag != NULL)
        *p_longflag = is_long;
    if (p_longlongflag != NULL)
        *p_longlongflag = is_longlong;
    if (p_size_tflag != NULL)
        *p_size_tflag = is_size_t;
    return f;
}
1752
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001753/* maximum number of characters required for output of %ld. 21 characters
1754 allows for 64-bit integers (in decimal) and an optional sign. */
1755#define MAX_LONG_CHARS 21
1756/* maximum number of characters required for output of %lld.
1757 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
1758 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
1759#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
1760
Walter Dörwaldd2034312007-05-18 16:29:38 +00001761PyObject *
1762PyUnicode_FromFormatV(const char *format, va_list vargs)
1763{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001764 va_list count;
1765 Py_ssize_t callcount = 0;
1766 PyObject **callresults = NULL;
1767 PyObject **callresult = NULL;
1768 Py_ssize_t n = 0;
1769 int width = 0;
1770 int precision = 0;
1771 int zeropad;
1772 const char* f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001773 PyUnicodeObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001774 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001775 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001776 Py_UCS4 maxchar = 127; /* result is ASCII by default */
1777 Py_UCS4 argmaxchar;
1778 Py_ssize_t numbersize = 0;
1779 char *numberresults = NULL;
1780 char *numberresult = NULL;
1781 Py_ssize_t i;
1782 int kind;
1783 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001784
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001785 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001786 /* step 1: count the number of %S/%R/%A/%s format specifications
1787 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
1788 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001789 * result in an array)
1790 * also esimate a upper bound for all the number formats in the string,
1791 * numbers will be formated in step 3 and be keept in a '\0'-separated
1792 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00001793 for (f = format; *f; f++) {
1794 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001795 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001796 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
1797 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
1798 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
1799 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001800
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001801 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001802#ifdef HAVE_LONG_LONG
1803 if (longlongflag) {
1804 if (width < MAX_LONG_LONG_CHARS)
1805 width = MAX_LONG_LONG_CHARS;
1806 }
1807 else
1808#endif
1809 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
1810 including sign. Decimal takes the most space. This
1811 isn't enough for octal. If a width is specified we
1812 need more (which we allocate later). */
1813 if (width < MAX_LONG_CHARS)
1814 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001815
1816 /* account for the size + '\0' to separate numbers
1817 inside of the numberresults buffer */
1818 numbersize += (width + 1);
1819 }
1820 }
1821 else if ((unsigned char)*f > 127) {
1822 PyErr_Format(PyExc_ValueError,
1823 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
1824 "string, got a non-ASCII byte: 0x%02x",
1825 (unsigned char)*f);
1826 return NULL;
1827 }
1828 }
1829 /* step 2: allocate memory for the results of
1830 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
1831 if (callcount) {
1832 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
1833 if (!callresults) {
1834 PyErr_NoMemory();
1835 return NULL;
1836 }
1837 callresult = callresults;
1838 }
1839 /* step 2.5: allocate memory for the results of formating numbers */
1840 if (numbersize) {
1841 numberresults = PyObject_Malloc(numbersize);
1842 if (!numberresults) {
1843 PyErr_NoMemory();
1844 goto fail;
1845 }
1846 numberresult = numberresults;
1847 }
1848
1849 /* step 3: format numbers and figure out how large a buffer we need */
1850 for (f = format; *f; f++) {
1851 if (*f == '%') {
1852 const char* p;
1853 int longflag;
1854 int longlongflag;
1855 int size_tflag;
1856 int numprinted;
1857
1858 p = f;
1859 zeropad = (f[1] == '0');
1860 f = parse_format_flags(f, &width, &precision,
1861 &longflag, &longlongflag, &size_tflag);
1862 switch (*f) {
1863 case 'c':
1864 {
1865 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001866 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001867 n++;
1868 break;
1869 }
1870 case '%':
1871 n++;
1872 break;
1873 case 'i':
1874 case 'd':
1875 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1876 width, precision, *f);
1877 if (longflag)
1878 numprinted = sprintf(numberresult, fmt,
1879 va_arg(count, long));
1880#ifdef HAVE_LONG_LONG
1881 else if (longlongflag)
1882 numprinted = sprintf(numberresult, fmt,
1883 va_arg(count, PY_LONG_LONG));
1884#endif
1885 else if (size_tflag)
1886 numprinted = sprintf(numberresult, fmt,
1887 va_arg(count, Py_ssize_t));
1888 else
1889 numprinted = sprintf(numberresult, fmt,
1890 va_arg(count, int));
1891 n += numprinted;
1892 /* advance by +1 to skip over the '\0' */
1893 numberresult += (numprinted + 1);
1894 assert(*(numberresult - 1) == '\0');
1895 assert(*(numberresult - 2) != '\0');
1896 assert(numprinted >= 0);
1897 assert(numberresult <= numberresults + numbersize);
1898 break;
1899 case 'u':
1900 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1901 width, precision, 'u');
1902 if (longflag)
1903 numprinted = sprintf(numberresult, fmt,
1904 va_arg(count, unsigned long));
1905#ifdef HAVE_LONG_LONG
1906 else if (longlongflag)
1907 numprinted = sprintf(numberresult, fmt,
1908 va_arg(count, unsigned PY_LONG_LONG));
1909#endif
1910 else if (size_tflag)
1911 numprinted = sprintf(numberresult, fmt,
1912 va_arg(count, size_t));
1913 else
1914 numprinted = sprintf(numberresult, fmt,
1915 va_arg(count, unsigned int));
1916 n += numprinted;
1917 numberresult += (numprinted + 1);
1918 assert(*(numberresult - 1) == '\0');
1919 assert(*(numberresult - 2) != '\0');
1920 assert(numprinted >= 0);
1921 assert(numberresult <= numberresults + numbersize);
1922 break;
1923 case 'x':
1924 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
1925 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
1926 n += numprinted;
1927 numberresult += (numprinted + 1);
1928 assert(*(numberresult - 1) == '\0');
1929 assert(*(numberresult - 2) != '\0');
1930 assert(numprinted >= 0);
1931 assert(numberresult <= numberresults + numbersize);
1932 break;
1933 case 'p':
1934 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
1935 /* %p is ill-defined: ensure leading 0x. */
1936 if (numberresult[1] == 'X')
1937 numberresult[1] = 'x';
1938 else if (numberresult[1] != 'x') {
1939 memmove(numberresult + 2, numberresult,
1940 strlen(numberresult) + 1);
1941 numberresult[0] = '0';
1942 numberresult[1] = 'x';
1943 numprinted += 2;
1944 }
1945 n += numprinted;
1946 numberresult += (numprinted + 1);
1947 assert(*(numberresult - 1) == '\0');
1948 assert(*(numberresult - 2) != '\0');
1949 assert(numprinted >= 0);
1950 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001951 break;
1952 case 's':
1953 {
1954 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00001955 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001956 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
1957 if (!str)
1958 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001959 /* since PyUnicode_DecodeUTF8 returns already flexible
1960 unicode objects, there is no need to call ready on them */
1961 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001962 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001963 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001964 /* Remember the str and switch to the next slot */
1965 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001966 break;
1967 }
1968 case 'U':
1969 {
1970 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02001971 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001972 if (PyUnicode_READY(obj) == -1)
1973 goto fail;
1974 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001975 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001976 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001977 break;
1978 }
1979 case 'V':
1980 {
1981 PyObject *obj = va_arg(count, PyObject *);
1982 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001983 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001984 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02001985 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00001986 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001987 if (PyUnicode_READY(obj) == -1)
1988 goto fail;
1989 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001990 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001991 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001992 *callresult++ = NULL;
1993 }
1994 else {
1995 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
1996 if (!str_obj)
1997 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001998 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001999 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002000 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002001 *callresult++ = str_obj;
2002 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002003 break;
2004 }
2005 case 'S':
2006 {
2007 PyObject *obj = va_arg(count, PyObject *);
2008 PyObject *str;
2009 assert(obj);
2010 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002011 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002012 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002013 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002014 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002015 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002016 /* Remember the str and switch to the next slot */
2017 *callresult++ = str;
2018 break;
2019 }
2020 case 'R':
2021 {
2022 PyObject *obj = va_arg(count, PyObject *);
2023 PyObject *repr;
2024 assert(obj);
2025 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002026 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002027 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002028 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002029 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002030 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002031 /* Remember the repr and switch to the next slot */
2032 *callresult++ = repr;
2033 break;
2034 }
2035 case 'A':
2036 {
2037 PyObject *obj = va_arg(count, PyObject *);
2038 PyObject *ascii;
2039 assert(obj);
2040 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002041 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002042 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002043 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002044 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002045 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002046 /* Remember the repr and switch to the next slot */
2047 *callresult++ = ascii;
2048 break;
2049 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002050 default:
2051 /* if we stumble upon an unknown
2052 formatting code, copy the rest of
2053 the format string to the output
2054 string. (we cannot just skip the
2055 code, since there's no way to know
2056 what's in the argument list) */
2057 n += strlen(p);
2058 goto expand;
2059 }
2060 } else
2061 n++;
2062 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002063 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002064 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002065 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002066 we don't have to resize the string.
2067 There can be no errors beyond this point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002068 string = (PyUnicodeObject *)PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002069 if (!string)
2070 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002071 kind = PyUnicode_KIND(string);
2072 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002073 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002074 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002075
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002076 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002077 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002078 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002079
2080 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002081 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2082 /* checking for == because the last argument could be a empty
2083 string, which causes i to point to end, the assert at the end of
2084 the loop */
2085 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002086
Benjamin Peterson14339b62009-01-31 16:36:08 +00002087 switch (*f) {
2088 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002089 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002090 const int ordinal = va_arg(vargs, int);
2091 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002092 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002093 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002094 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002095 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002096 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002097 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002098 case 'p':
2099 /* unused, since we already have the result */
2100 if (*f == 'p')
2101 (void) va_arg(vargs, void *);
2102 else
2103 (void) va_arg(vargs, int);
2104 /* extract the result from numberresults and append. */
2105 for (; *numberresult; ++i, ++numberresult)
2106 PyUnicode_WRITE(kind, data, i, *numberresult);
2107 /* skip over the separating '\0' */
2108 assert(*numberresult == '\0');
2109 numberresult++;
2110 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002111 break;
2112 case 's':
2113 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002114 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002115 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002116 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002117 size = PyUnicode_GET_LENGTH(*callresult);
2118 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002119 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2120 *callresult, 0,
2121 size) < 0)
2122 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002123 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002124 /* We're done with the unicode()/repr() => forget it */
2125 Py_DECREF(*callresult);
2126 /* switch to next unicode()/repr() result */
2127 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002128 break;
2129 }
2130 case 'U':
2131 {
2132 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002133 Py_ssize_t size;
2134 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2135 size = PyUnicode_GET_LENGTH(obj);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002136 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2137 obj, 0,
2138 size) < 0)
2139 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002140 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002141 break;
2142 }
2143 case 'V':
2144 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002145 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002146 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002147 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002148 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002149 size = PyUnicode_GET_LENGTH(obj);
2150 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002151 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2152 obj, 0,
2153 size) < 0)
2154 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002155 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002156 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002157 size = PyUnicode_GET_LENGTH(*callresult);
2158 assert(PyUnicode_KIND(*callresult) <=
2159 PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002160 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2161 *callresult,
2162 0, size) < 0)
2163 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002164 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002165 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002166 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002167 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002168 break;
2169 }
2170 case 'S':
2171 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002172 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002173 {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002174 /* unused, since we already have the result */
2175 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002176 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002177 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2178 *callresult, 0,
2179 PyUnicode_GET_LENGTH(*callresult)) < 0)
2180 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002181 i += PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002182 /* We're done with the unicode()/repr() => forget it */
2183 Py_DECREF(*callresult);
2184 /* switch to next unicode()/repr() result */
2185 ++callresult;
2186 break;
2187 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002188 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002189 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002190 break;
2191 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002192 for (; *p; ++p, ++i)
2193 PyUnicode_WRITE(kind, data, i, *p);
2194 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002195 goto end;
2196 }
Victor Stinner1205f272010-09-11 00:54:47 +00002197 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002198 else {
2199 assert(i < PyUnicode_GET_LENGTH(string));
2200 PyUnicode_WRITE(kind, data, i++, *f);
2201 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002202 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002203 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002204
Benjamin Peterson29060642009-01-31 22:14:21 +00002205 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002206 if (callresults)
2207 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002208 if (numberresults)
2209 PyObject_Free(numberresults);
2210 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00002211 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002212 if (callresults) {
2213 PyObject **callresult2 = callresults;
2214 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002215 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002216 ++callresult2;
2217 }
2218 PyObject_Free(callresults);
2219 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002220 if (numberresults)
2221 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002222 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002223}
2224
Walter Dörwaldd2034312007-05-18 16:29:38 +00002225PyObject *
2226PyUnicode_FromFormat(const char *format, ...)
2227{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002228 PyObject* ret;
2229 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002230
2231#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002232 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002233#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002234 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002235#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002236 ret = PyUnicode_FromFormatV(format, vargs);
2237 va_end(vargs);
2238 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002239}
2240
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002241#ifdef HAVE_WCHAR_H
2242
Victor Stinner5593d8a2010-10-02 11:11:27 +00002243/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2244 convert a Unicode object to a wide character string.
2245
Victor Stinnerd88d9832011-09-06 02:00:05 +02002246 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002247 character) required to convert the unicode object. Ignore size argument.
2248
Victor Stinnerd88d9832011-09-06 02:00:05 +02002249 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002250 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002251 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002252static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00002253unicode_aswidechar(PyUnicodeObject *unicode,
2254 wchar_t *w,
2255 Py_ssize_t size)
2256{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002257 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002258 const wchar_t *wstr;
2259
2260 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
2261 if (wstr == NULL)
2262 return -1;
2263
Victor Stinner5593d8a2010-10-02 11:11:27 +00002264 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002265 if (size > res)
2266 size = res + 1;
2267 else
2268 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002269 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002270 return res;
2271 }
2272 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002273 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002274}
2275
2276Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002277PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002278 wchar_t *w,
2279 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002280{
2281 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002282 PyErr_BadInternalCall();
2283 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002284 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002285 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002286}
2287
Victor Stinner137c34c2010-09-29 10:25:54 +00002288wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002289PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002290 Py_ssize_t *size)
2291{
2292 wchar_t* buffer;
2293 Py_ssize_t buflen;
2294
2295 if (unicode == NULL) {
2296 PyErr_BadInternalCall();
2297 return NULL;
2298 }
2299
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002300 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002301 if (buflen == -1)
2302 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002303 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002304 PyErr_NoMemory();
2305 return NULL;
2306 }
2307
Victor Stinner137c34c2010-09-29 10:25:54 +00002308 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2309 if (buffer == NULL) {
2310 PyErr_NoMemory();
2311 return NULL;
2312 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002313 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002314 if (buflen == -1)
2315 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002316 if (size != NULL)
2317 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002318 return buffer;
2319}
2320
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002321#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002322
Alexander Belopolsky40018472011-02-26 01:02:56 +00002323PyObject *
2324PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002325{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002326 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002327 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002328 PyErr_SetString(PyExc_ValueError,
2329 "chr() arg not in range(0x110000)");
2330 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002331 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002332
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002333 if (ordinal < 256)
2334 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002335
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002336 v = PyUnicode_New(1, ordinal);
2337 if (v == NULL)
2338 return NULL;
2339 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
2340 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002341}
2342
Alexander Belopolsky40018472011-02-26 01:02:56 +00002343PyObject *
2344PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002345{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002346 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002347 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002348 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002349 if (PyUnicode_READY(obj))
2350 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002351 Py_INCREF(obj);
2352 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002353 }
2354 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002355 /* For a Unicode subtype that's not a Unicode object,
2356 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002357 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002358 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002359 PyErr_Format(PyExc_TypeError,
2360 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002361 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002362 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002363}
2364
Alexander Belopolsky40018472011-02-26 01:02:56 +00002365PyObject *
2366PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002367 const char *encoding,
2368 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002369{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002370 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002371 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002372
Guido van Rossumd57fd912000-03-10 22:53:23 +00002373 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002374 PyErr_BadInternalCall();
2375 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002376 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002377
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002378 /* Decoding bytes objects is the most common case and should be fast */
2379 if (PyBytes_Check(obj)) {
2380 if (PyBytes_GET_SIZE(obj) == 0) {
2381 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002382 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002383 }
2384 else {
2385 v = PyUnicode_Decode(
2386 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2387 encoding, errors);
2388 }
2389 return v;
2390 }
2391
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002392 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002393 PyErr_SetString(PyExc_TypeError,
2394 "decoding str is not supported");
2395 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002396 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002397
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002398 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2399 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2400 PyErr_Format(PyExc_TypeError,
2401 "coercing to str: need bytes, bytearray "
2402 "or buffer-like object, %.80s found",
2403 Py_TYPE(obj)->tp_name);
2404 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002405 }
Tim Petersced69f82003-09-16 20:30:58 +00002406
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002407 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002408 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002409 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002410 }
Tim Petersced69f82003-09-16 20:30:58 +00002411 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002412 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002413
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002414 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002415 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002416}
2417
/* Write a normalized copy of `encoding` into `lower`: upper-case letters are
   lower-cased and '_' becomes '-', so that e.g. "UTF_8" is recognised.

   `lower_len` is the capacity of `lower` in bytes, terminator included.
   Returns 1 on success and 0 when `encoding` does not fit in
   lower_len-1 characters (in that case `lower` is not null-terminated). */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst = lower;
    /* Last usable slot: reserved for the terminating null byte. */
    char *const limit = lower + (lower_len - 1);

    for (src = encoding; *src != '\0'; src++) {
        char ch = *src;

        if (dst == limit)
            return 0;

        if (Py_ISUPPER(ch))
            ch = Py_TOLOWER(ch);
        else if (ch == '_')
            ch = '-';
        *dst++ = ch;
    }
    *dst = '\0';
    return 1;
}
2450
Alexander Belopolsky40018472011-02-26 01:02:56 +00002451PyObject *
2452PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002453 Py_ssize_t size,
2454 const char *encoding,
2455 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002456{
2457 PyObject *buffer = NULL, *unicode;
2458 Py_buffer info;
2459 char lower[11]; /* Enough for any encoding shortcut */
2460
2461 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002462 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00002463
2464 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002465 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002466 if ((strcmp(lower, "utf-8") == 0) ||
2467 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002468 return PyUnicode_DecodeUTF8(s, size, errors);
2469 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002470 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002471 (strcmp(lower, "iso-8859-1") == 0))
2472 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002473#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002474 else if (strcmp(lower, "mbcs") == 0)
2475 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002476#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002477 else if (strcmp(lower, "ascii") == 0)
2478 return PyUnicode_DecodeASCII(s, size, errors);
2479 else if (strcmp(lower, "utf-16") == 0)
2480 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2481 else if (strcmp(lower, "utf-32") == 0)
2482 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2483 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002484
2485 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002486 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002487 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002488 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002489 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002490 if (buffer == NULL)
2491 goto onError;
2492 unicode = PyCodec_Decode(buffer, encoding, errors);
2493 if (unicode == NULL)
2494 goto onError;
2495 if (!PyUnicode_Check(unicode)) {
2496 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002497 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002498 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002499 Py_DECREF(unicode);
2500 goto onError;
2501 }
2502 Py_DECREF(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002503 if (PyUnicode_READY(unicode)) {
2504 Py_DECREF(unicode);
2505 return NULL;
2506 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002507 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002508
Benjamin Peterson29060642009-01-31 22:14:21 +00002509 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002510 Py_XDECREF(buffer);
2511 return NULL;
2512}
2513
Alexander Belopolsky40018472011-02-26 01:02:56 +00002514PyObject *
2515PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002516 const char *encoding,
2517 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002518{
2519 PyObject *v;
2520
2521 if (!PyUnicode_Check(unicode)) {
2522 PyErr_BadArgument();
2523 goto onError;
2524 }
2525
2526 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002527 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002528
2529 /* Decode via the codec registry */
2530 v = PyCodec_Decode(unicode, encoding, errors);
2531 if (v == NULL)
2532 goto onError;
2533 return v;
2534
Benjamin Peterson29060642009-01-31 22:14:21 +00002535 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002536 return NULL;
2537}
2538
Alexander Belopolsky40018472011-02-26 01:02:56 +00002539PyObject *
2540PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002541 const char *encoding,
2542 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002543{
2544 PyObject *v;
2545
2546 if (!PyUnicode_Check(unicode)) {
2547 PyErr_BadArgument();
2548 goto onError;
2549 }
2550
2551 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002552 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002553
2554 /* Decode via the codec registry */
2555 v = PyCodec_Decode(unicode, encoding, errors);
2556 if (v == NULL)
2557 goto onError;
2558 if (!PyUnicode_Check(v)) {
2559 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002560 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002561 Py_TYPE(v)->tp_name);
2562 Py_DECREF(v);
2563 goto onError;
2564 }
2565 return v;
2566
Benjamin Peterson29060642009-01-31 22:14:21 +00002567 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002568 return NULL;
2569}
2570
Alexander Belopolsky40018472011-02-26 01:02:56 +00002571PyObject *
2572PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002573 Py_ssize_t size,
2574 const char *encoding,
2575 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002576{
2577 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002578
Guido van Rossumd57fd912000-03-10 22:53:23 +00002579 unicode = PyUnicode_FromUnicode(s, size);
2580 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002581 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002582 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2583 Py_DECREF(unicode);
2584 return v;
2585}
2586
Alexander Belopolsky40018472011-02-26 01:02:56 +00002587PyObject *
2588PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002589 const char *encoding,
2590 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002591{
2592 PyObject *v;
2593
2594 if (!PyUnicode_Check(unicode)) {
2595 PyErr_BadArgument();
2596 goto onError;
2597 }
2598
2599 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002600 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002601
2602 /* Encode via the codec registry */
2603 v = PyCodec_Encode(unicode, encoding, errors);
2604 if (v == NULL)
2605 goto onError;
2606 return v;
2607
Benjamin Peterson29060642009-01-31 22:14:21 +00002608 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002609 return NULL;
2610}
2611
Victor Stinnerad158722010-10-27 00:25:46 +00002612PyObject *
2613PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00002614{
Victor Stinner99b95382011-07-04 14:23:54 +02002615#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002616 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2617 PyUnicode_GET_SIZE(unicode),
2618 NULL);
2619#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002620 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00002621#else
Victor Stinner793b5312011-04-27 00:24:21 +02002622 PyInterpreterState *interp = PyThreadState_GET()->interp;
2623 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2624 cannot use it to encode and decode filenames before it is loaded. Load
2625 the Python codec requires to encode at least its own filename. Use the C
2626 version of the locale codec until the codec registry is initialized and
2627 the Python codec is loaded.
2628
2629 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2630 cannot only rely on it: check also interp->fscodec_initialized for
2631 subinterpreters. */
2632 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00002633 return PyUnicode_AsEncodedString(unicode,
2634 Py_FileSystemDefaultEncoding,
2635 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00002636 }
2637 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002638 /* locale encoding with surrogateescape */
2639 wchar_t *wchar;
2640 char *bytes;
2641 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00002642 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002643
2644 wchar = PyUnicode_AsWideCharString(unicode, NULL);
2645 if (wchar == NULL)
2646 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002647 bytes = _Py_wchar2char(wchar, &error_pos);
2648 if (bytes == NULL) {
2649 if (error_pos != (size_t)-1) {
2650 char *errmsg = strerror(errno);
2651 PyObject *exc = NULL;
2652 if (errmsg == NULL)
2653 errmsg = "Py_wchar2char() failed";
2654 raise_encode_exception(&exc,
2655 "filesystemencoding",
2656 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
2657 error_pos, error_pos+1,
2658 errmsg);
2659 Py_XDECREF(exc);
2660 }
2661 else
2662 PyErr_NoMemory();
2663 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002664 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002665 }
2666 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002667
2668 bytes_obj = PyBytes_FromString(bytes);
2669 PyMem_Free(bytes);
2670 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00002671 }
Victor Stinnerad158722010-10-27 00:25:46 +00002672#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00002673}
2674
Alexander Belopolsky40018472011-02-26 01:02:56 +00002675PyObject *
2676PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002677 const char *encoding,
2678 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002679{
2680 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00002681 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00002682
Guido van Rossumd57fd912000-03-10 22:53:23 +00002683 if (!PyUnicode_Check(unicode)) {
2684 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002685 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002686 }
Fred Drakee4315f52000-05-09 19:53:39 +00002687
Victor Stinner2f283c22011-03-02 01:21:46 +00002688 if (encoding == NULL) {
2689 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002690 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002691 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002692 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner2f283c22011-03-02 01:21:46 +00002693 }
Fred Drakee4315f52000-05-09 19:53:39 +00002694
2695 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002696 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002697 if ((strcmp(lower, "utf-8") == 0) ||
2698 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00002699 {
Victor Stinner2f283c22011-03-02 01:21:46 +00002700 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002701 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002702 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002703 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00002704 }
Victor Stinner37296e82010-06-10 13:36:23 +00002705 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002706 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002707 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002708 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002709#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002710 else if (strcmp(lower, "mbcs") == 0)
2711 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2712 PyUnicode_GET_SIZE(unicode),
2713 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002714#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002715 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002716 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00002717 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002718
2719 /* Encode via the codec registry */
2720 v = PyCodec_Encode(unicode, encoding, errors);
2721 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002722 return NULL;
2723
2724 /* The normal path */
2725 if (PyBytes_Check(v))
2726 return v;
2727
2728 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002729 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002730 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002731 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002732
2733 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
2734 "encoder %s returned bytearray instead of bytes",
2735 encoding);
2736 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002737 Py_DECREF(v);
2738 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002739 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002740
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002741 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
2742 Py_DECREF(v);
2743 return b;
2744 }
2745
2746 PyErr_Format(PyExc_TypeError,
2747 "encoder did not return a bytes object (type=%.400s)",
2748 Py_TYPE(v)->tp_name);
2749 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002750 return NULL;
2751}
2752
Alexander Belopolsky40018472011-02-26 01:02:56 +00002753PyObject *
2754PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002755 const char *encoding,
2756 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002757{
2758 PyObject *v;
2759
2760 if (!PyUnicode_Check(unicode)) {
2761 PyErr_BadArgument();
2762 goto onError;
2763 }
2764
2765 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002766 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002767
2768 /* Encode via the codec registry */
2769 v = PyCodec_Encode(unicode, encoding, errors);
2770 if (v == NULL)
2771 goto onError;
2772 if (!PyUnicode_Check(v)) {
2773 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002774 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002775 Py_TYPE(v)->tp_name);
2776 Py_DECREF(v);
2777 goto onError;
2778 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002779 return v;
Tim Petersced69f82003-09-16 20:30:58 +00002780
Benjamin Peterson29060642009-01-31 22:14:21 +00002781 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002782 return NULL;
2783}
2784
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002785PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00002786PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002787 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00002788 return PyUnicode_DecodeFSDefaultAndSize(s, size);
2789}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002790
Christian Heimes5894ba72007-11-04 11:43:14 +00002791PyObject*
2792PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
2793{
Victor Stinner99b95382011-07-04 14:23:54 +02002794#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002795 return PyUnicode_DecodeMBCS(s, size, NULL);
2796#elif defined(__APPLE__)
2797 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
2798#else
Victor Stinner793b5312011-04-27 00:24:21 +02002799 PyInterpreterState *interp = PyThreadState_GET()->interp;
2800 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2801 cannot use it to encode and decode filenames before it is loaded. Load
2802 the Python codec requires to encode at least its own filename. Use the C
2803 version of the locale codec until the codec registry is initialized and
2804 the Python codec is loaded.
2805
2806 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2807 cannot only rely on it: check also interp->fscodec_initialized for
2808 subinterpreters. */
2809 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002810 return PyUnicode_Decode(s, size,
2811 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00002812 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002813 }
2814 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002815 /* locale encoding with surrogateescape */
2816 wchar_t *wchar;
2817 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00002818 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002819
2820 if (s[size] != '\0' || size != strlen(s)) {
2821 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2822 return NULL;
2823 }
2824
Victor Stinner168e1172010-10-16 23:16:16 +00002825 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002826 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00002827 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002828
Victor Stinner168e1172010-10-16 23:16:16 +00002829 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002830 PyMem_Free(wchar);
2831 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002832 }
Victor Stinnerad158722010-10-27 00:25:46 +00002833#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002834}
2835
Martin v. Löwis011e8422009-05-05 04:43:17 +00002836
2837int
2838PyUnicode_FSConverter(PyObject* arg, void* addr)
2839{
2840 PyObject *output = NULL;
2841 Py_ssize_t size;
2842 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002843 if (arg == NULL) {
2844 Py_DECREF(*(PyObject**)addr);
2845 return 1;
2846 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00002847 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00002848 output = arg;
2849 Py_INCREF(output);
2850 }
2851 else {
2852 arg = PyUnicode_FromObject(arg);
2853 if (!arg)
2854 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00002855 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002856 Py_DECREF(arg);
2857 if (!output)
2858 return 0;
2859 if (!PyBytes_Check(output)) {
2860 Py_DECREF(output);
2861 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
2862 return 0;
2863 }
2864 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00002865 size = PyBytes_GET_SIZE(output);
2866 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002867 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05002868 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00002869 Py_DECREF(output);
2870 return 0;
2871 }
2872 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002873 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00002874}
2875
2876
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002877int
2878PyUnicode_FSDecoder(PyObject* arg, void* addr)
2879{
2880 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002881 if (arg == NULL) {
2882 Py_DECREF(*(PyObject**)addr);
2883 return 1;
2884 }
2885 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002886 if (PyUnicode_READY(arg))
2887 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002888 output = arg;
2889 Py_INCREF(output);
2890 }
2891 else {
2892 arg = PyBytes_FromObject(arg);
2893 if (!arg)
2894 return 0;
2895 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
2896 PyBytes_GET_SIZE(arg));
2897 Py_DECREF(arg);
2898 if (!output)
2899 return 0;
2900 if (!PyUnicode_Check(output)) {
2901 Py_DECREF(output);
2902 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
2903 return 0;
2904 }
2905 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002906 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
2907 PyUnicode_GET_LENGTH(output), 0, 1)) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002908 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2909 Py_DECREF(output);
2910 return 0;
2911 }
2912 *(PyObject**)addr = output;
2913 return Py_CLEANUP_SUPPORTED;
2914}
2915
2916
Martin v. Löwis5b222132007-06-10 09:51:05 +00002917char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002918PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002919{
Christian Heimesf3863112007-11-22 07:46:41 +00002920 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002921 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
2922
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00002923 if (!PyUnicode_Check(unicode)) {
2924 PyErr_BadArgument();
2925 return NULL;
2926 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002927 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002928 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002929
Victor Stinnere90fe6a2011-10-01 16:48:13 +02002930 if (PyUnicode_UTF8(unicode) == NULL) {
2931 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002932 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
2933 if (bytes == NULL)
2934 return NULL;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02002935 _PyUnicode_UTF8(u) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
2936 if (_PyUnicode_UTF8(u) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002937 Py_DECREF(bytes);
2938 return NULL;
2939 }
Victor Stinnere90fe6a2011-10-01 16:48:13 +02002940 _PyUnicode_UTF8_LENGTH(u) = PyBytes_GET_SIZE(bytes);
2941 Py_MEMCPY(_PyUnicode_UTF8(u), PyBytes_AS_STRING(bytes), _PyUnicode_UTF8_LENGTH(u) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002942 Py_DECREF(bytes);
2943 }
2944
2945 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02002946 *psize = PyUnicode_UTF8_LENGTH(unicode);
2947 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00002948}
2949
2950char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002951PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00002952{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002953 return PyUnicode_AsUTF8AndSize(unicode, NULL);
2954}
2955
#if defined(Py_DEBUG)
/* Debug-build counter: incremented by PyUnicode_AsUnicodeAndSize() each
   time it has to build the wchar_t representation of a string. */
int unicode_as_unicode_calls = 0;
#endif
2959
2960
2961Py_UNICODE *
2962PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
2963{
2964 PyUnicodeObject *u;
2965 const unsigned char *one_byte;
2966#if SIZEOF_WCHAR_T == 4
2967 const Py_UCS2 *two_bytes;
2968#else
2969 const Py_UCS4 *four_bytes;
2970 const Py_UCS4 *ucs4_end;
2971 Py_ssize_t num_surrogates;
2972#endif
2973 wchar_t *w;
2974 wchar_t *wchar_end;
2975
2976 if (!PyUnicode_Check(unicode)) {
2977 PyErr_BadArgument();
2978 return NULL;
2979 }
2980 u = (PyUnicodeObject*)unicode;
2981 if (_PyUnicode_WSTR(u) == NULL) {
2982 /* Non-ASCII compact unicode object */
2983 assert(_PyUnicode_KIND(u) != 0);
2984 assert(PyUnicode_IS_READY(u));
2985
2986#ifdef Py_DEBUG
2987 ++unicode_as_unicode_calls;
2988#endif
2989
2990 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
2991#if SIZEOF_WCHAR_T == 2
2992 four_bytes = PyUnicode_4BYTE_DATA(u);
2993 ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
2994 num_surrogates = 0;
2995
2996 for (; four_bytes < ucs4_end; ++four_bytes) {
2997 if (*four_bytes > 0xFFFF)
2998 ++num_surrogates;
2999 }
3000
3001 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
3002 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
3003 if (!_PyUnicode_WSTR(u)) {
3004 PyErr_NoMemory();
3005 return NULL;
3006 }
3007 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;
3008
3009 w = _PyUnicode_WSTR(u);
3010 wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
3011 four_bytes = PyUnicode_4BYTE_DATA(u);
3012 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3013 if (*four_bytes > 0xFFFF) {
3014 /* encode surrogate pair in this case */
3015 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
3016 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
3017 }
3018 else
3019 *w = *four_bytes;
3020
3021 if (w > wchar_end) {
3022 assert(0 && "Miscalculated string end");
3023 }
3024 }
3025 *w = 0;
3026#else
3027 /* sizeof(wchar_t) == 4 */
3028 Py_FatalError("Impossible unicode object state, wstr and str "
3029 "should share memory already.");
3030 return NULL;
3031#endif
3032 }
3033 else {
3034 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3035 (_PyUnicode_LENGTH(u) + 1));
3036 if (!_PyUnicode_WSTR(u)) {
3037 PyErr_NoMemory();
3038 return NULL;
3039 }
3040 if (!PyUnicode_IS_COMPACT_ASCII(u))
3041 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
3042 w = _PyUnicode_WSTR(u);
3043 wchar_end = w + _PyUnicode_LENGTH(u);
3044
3045 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
3046 one_byte = PyUnicode_1BYTE_DATA(u);
3047 for (; w < wchar_end; ++one_byte, ++w)
3048 *w = *one_byte;
3049 /* null-terminate the wstr */
3050 *w = 0;
3051 }
3052 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
3053#if SIZEOF_WCHAR_T == 4
3054 two_bytes = PyUnicode_2BYTE_DATA(u);
3055 for (; w < wchar_end; ++two_bytes, ++w)
3056 *w = *two_bytes;
3057 /* null-terminate the wstr */
3058 *w = 0;
3059#else
3060 /* sizeof(wchar_t) == 2 */
3061 PyObject_FREE(_PyUnicode_WSTR(u));
3062 _PyUnicode_WSTR(u) = NULL;
3063 Py_FatalError("Impossible unicode object state, wstr "
3064 "and str should share memory already.");
3065 return NULL;
3066#endif
3067 }
3068 else {
3069 assert(0 && "This should never happen.");
3070 }
3071 }
3072 }
3073 if (size != NULL)
3074 *size = PyUnicode_WSTR_LENGTH(u);
3075 return _PyUnicode_WSTR(u);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003076}
3077
Alexander Belopolsky40018472011-02-26 01:02:56 +00003078Py_UNICODE *
3079PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003080{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003081 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003082}
3083
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003084
Alexander Belopolsky40018472011-02-26 01:02:56 +00003085Py_ssize_t
3086PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003087{
3088 if (!PyUnicode_Check(unicode)) {
3089 PyErr_BadArgument();
3090 goto onError;
3091 }
3092 return PyUnicode_GET_SIZE(unicode);
3093
Benjamin Peterson29060642009-01-31 22:14:21 +00003094 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003095 return -1;
3096}
3097
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003098Py_ssize_t
3099PyUnicode_GetLength(PyObject *unicode)
3100{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003101 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003102 PyErr_BadArgument();
3103 return -1;
3104 }
3105
3106 return PyUnicode_GET_LENGTH(unicode);
3107}
3108
3109Py_UCS4
3110PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3111{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003112 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3113 PyErr_BadArgument();
3114 return (Py_UCS4)-1;
3115 }
3116 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3117 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003118 return (Py_UCS4)-1;
3119 }
3120 return PyUnicode_READ_CHAR(unicode, index);
3121}
3122
3123int
3124PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3125{
3126 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003127 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003128 return -1;
3129 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02003130 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3131 PyErr_SetString(PyExc_IndexError, "string index out of range");
3132 return -1;
3133 }
3134 if (_PyUnicode_Dirty(unicode))
3135 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003136 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3137 index, ch);
3138 return 0;
3139}
3140
/* Return the name of the default encoding; it is the fixed string
   "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    const char *encoding = "utf-8";

    return encoding;
}
3146
Victor Stinner554f3f02010-06-16 23:33:54 +00003147/* create or adjust a UnicodeDecodeError */
3148static void
3149make_decode_exception(PyObject **exceptionObject,
3150 const char *encoding,
3151 const char *input, Py_ssize_t length,
3152 Py_ssize_t startpos, Py_ssize_t endpos,
3153 const char *reason)
3154{
3155 if (*exceptionObject == NULL) {
3156 *exceptionObject = PyUnicodeDecodeError_Create(
3157 encoding, input, length, startpos, endpos, reason);
3158 }
3159 else {
3160 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3161 goto onError;
3162 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3163 goto onError;
3164 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3165 goto onError;
3166 }
3167 return;
3168
3169onError:
3170 Py_DECREF(*exceptionObject);
3171 *exceptionObject = NULL;
3172}
3173
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003174/* error handling callback helper:
3175 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003176 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003177 and adjust various state variables.
3178 return 0 on success, -1 on error
3179*/
3180
Alexander Belopolsky40018472011-02-26 01:02:56 +00003181static int
3182unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003183 const char *encoding, const char *reason,
3184 const char **input, const char **inend, Py_ssize_t *startinpos,
3185 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
3186 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003187{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003188 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003189
3190 PyObject *restuple = NULL;
3191 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003192 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003193 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003194 Py_ssize_t requiredsize;
3195 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003196 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003197 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003198 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003199 int res = -1;
3200
3201 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003202 *errorHandler = PyCodec_LookupError(errors);
3203 if (*errorHandler == NULL)
3204 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003205 }
3206
Victor Stinner554f3f02010-06-16 23:33:54 +00003207 make_decode_exception(exceptionObject,
3208 encoding,
3209 *input, *inend - *input,
3210 *startinpos, *endinpos,
3211 reason);
3212 if (*exceptionObject == NULL)
3213 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003214
3215 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3216 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003217 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003218 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003219 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003220 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003221 }
3222 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003223 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003224
3225 /* Copy back the bytes variables, which might have been modified by the
3226 callback */
3227 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3228 if (!inputobj)
3229 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003230 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003231 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003232 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003233 *input = PyBytes_AS_STRING(inputobj);
3234 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003235 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003236 /* we can DECREF safely, as the exception has another reference,
3237 so the object won't go away. */
3238 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003239
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003240 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003241 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003242 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003243 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3244 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003245 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003246
3247 /* need more space? (at least enough for what we
3248 have+the replacement+the rest of the string (starting
3249 at the new input position), so we won't have to check space
3250 when there are no errors in the rest of the string) */
3251 repptr = PyUnicode_AS_UNICODE(repunicode);
3252 repsize = PyUnicode_GET_SIZE(repunicode);
3253 requiredsize = *outpos + repsize + insize-newpos;
3254 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003255 if (requiredsize<2*outsize)
3256 requiredsize = 2*outsize;
Victor Stinnerfe226c02011-10-03 03:52:20 +02003257 if (PyUnicode_Resize((PyObject**)output, requiredsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003258 goto onError;
3259 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003260 }
3261 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003262 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003263 Py_UNICODE_COPY(*outptr, repptr, repsize);
3264 *outptr += repsize;
3265 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003266
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003267 /* we made it! */
3268 res = 0;
3269
Benjamin Peterson29060642009-01-31 22:14:21 +00003270 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003271 Py_XDECREF(restuple);
3272 return res;
3273}
3274
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003275/* --- UTF-7 Codec -------------------------------------------------------- */
3276
Antoine Pitrou244651a2009-05-04 18:56:13 +00003277/* See RFC2152 for details. We encode conservatively and decode liberally. */
3278
/* Helper macros for the base-64 alphabet used by the UTF-7 codec.  All of
   them may evaluate their argument more than once. */

/* Non-zero if c is one of the 64 base-64 characters (A-Z, a-z, 0-9, +, /). */

#define IS_BASE64(c) \
    ((c) == '+' || (c) == '/' || \
     ('0' <= (c) && (c) <= '9') || \
     ('A' <= (c) && (c) <= 'Z') || \
     ('a' <= (c) && (c) <= 'z'))

/* The 6-bit value of c, which must already be known to be a base-64
   character (anything unrecognised falls through to 63). */

#define FROM_BASE64(c) \
    (('A' <= (c) && (c) <= 'Z') ? (c) - 'A' : \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 : \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 : \
     ((c) == '+') ? 62 : 63)

/* The base-64 character for the low 6 bits of n. */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ" \
     "abcdefghijklmnopqrstuvwxyz" \
     "0123456789+/"[(n) & 63])

/* DECODE_DIRECT: true if this byte, found outside a shift sequence in a
 * UTF-7 string, decodes as itself.  Decoding is permissive: every ASCII
 * byte qualifies except '+', which opens a base-64 section. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) < 128)
3309
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array, indexed by ASCII code (0-127),
 * identifies these different sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c:        the character; only 1..127 can ever be direct (NUL and
 *           non-ASCII never are), which also keeps the table index in range.
 * directO:  non-zero to write Set O characters (category 1) as themselves.
 * directWS: non-zero to write whitespace (category 2) as itself.
 * Set D characters (category 0) are always direct. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003355
Alexander Belopolsky40018472011-02-26 01:02:56 +00003356PyObject *
3357PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003358 Py_ssize_t size,
3359 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003360{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003361 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3362}
3363
Antoine Pitrou244651a2009-05-04 18:56:13 +00003364/* The decoder. The only state we preserve is our read position,
3365 * i.e. how many characters we have consumed. So if we end in the
3366 * middle of a shift sequence we have to back off the read position
3367 * and the output to the beginning of the sequence, otherwise we lose
3368 * all the shift state (seen bits, number of bits seen, high
3369 * surrogate). */
3370
Alexander Belopolsky40018472011-02-26 01:02:56 +00003371PyObject *
3372PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003373 Py_ssize_t size,
3374 const char *errors,
3375 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003376{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003377 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003378 Py_ssize_t startinpos;
3379 Py_ssize_t endinpos;
3380 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003381 const char *e;
3382 PyUnicodeObject *unicode;
3383 Py_UNICODE *p;
3384 const char *errmsg = "";
3385 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003386 Py_UNICODE *shiftOutStart;
3387 unsigned int base64bits = 0;
3388 unsigned long base64buffer = 0;
3389 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003390 PyObject *errorHandler = NULL;
3391 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003392
3393 unicode = _PyUnicode_New(size);
3394 if (!unicode)
3395 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003396 if (size == 0) {
3397 if (consumed)
3398 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003399 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003400 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003401
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003402 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003403 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003404 e = s + size;
3405
3406 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003407 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003408 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003409 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003410
Antoine Pitrou244651a2009-05-04 18:56:13 +00003411 if (inShift) { /* in a base-64 section */
3412 if (IS_BASE64(ch)) { /* consume a base-64 character */
3413 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3414 base64bits += 6;
3415 s++;
3416 if (base64bits >= 16) {
3417 /* we have enough bits for a UTF-16 value */
3418 Py_UNICODE outCh = (Py_UNICODE)
3419 (base64buffer >> (base64bits-16));
3420 base64bits -= 16;
3421 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3422 if (surrogate) {
3423 /* expecting a second surrogate */
3424 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3425#ifdef Py_UNICODE_WIDE
3426 *p++ = (((surrogate & 0x3FF)<<10)
3427 | (outCh & 0x3FF)) + 0x10000;
3428#else
3429 *p++ = surrogate;
3430 *p++ = outCh;
3431#endif
3432 surrogate = 0;
3433 }
3434 else {
3435 surrogate = 0;
3436 errmsg = "second surrogate missing";
3437 goto utf7Error;
3438 }
3439 }
3440 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3441 /* first surrogate */
3442 surrogate = outCh;
3443 }
3444 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3445 errmsg = "unexpected second surrogate";
3446 goto utf7Error;
3447 }
3448 else {
3449 *p++ = outCh;
3450 }
3451 }
3452 }
3453 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003454 inShift = 0;
3455 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003456 if (surrogate) {
3457 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003458 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003459 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003460 if (base64bits > 0) { /* left-over bits */
3461 if (base64bits >= 6) {
3462 /* We've seen at least one base-64 character */
3463 errmsg = "partial character in shift sequence";
3464 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003465 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003466 else {
3467 /* Some bits remain; they should be zero */
3468 if (base64buffer != 0) {
3469 errmsg = "non-zero padding bits in shift sequence";
3470 goto utf7Error;
3471 }
3472 }
3473 }
3474 if (ch != '-') {
3475 /* '-' is absorbed; other terminating
3476 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003477 *p++ = ch;
3478 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003479 }
3480 }
3481 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003482 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003483 s++; /* consume '+' */
3484 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003485 s++;
3486 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003487 }
3488 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003489 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003490 shiftOutStart = p;
3491 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003492 }
3493 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003494 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003495 *p++ = ch;
3496 s++;
3497 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003498 else {
3499 startinpos = s-starts;
3500 s++;
3501 errmsg = "unexpected special character";
3502 goto utf7Error;
3503 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003504 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003505utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003506 outpos = p-PyUnicode_AS_UNICODE(unicode);
3507 endinpos = s-starts;
3508 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003509 errors, &errorHandler,
3510 "utf7", errmsg,
3511 &starts, &e, &startinpos, &endinpos, &exc, &s,
3512 &unicode, &outpos, &p))
3513 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003514 }
3515
Antoine Pitrou244651a2009-05-04 18:56:13 +00003516 /* end of string */
3517
3518 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3519 /* if we're in an inconsistent state, that's an error */
3520 if (surrogate ||
3521 (base64bits >= 6) ||
3522 (base64bits > 0 && base64buffer != 0)) {
3523 outpos = p-PyUnicode_AS_UNICODE(unicode);
3524 endinpos = size;
3525 if (unicode_decode_call_errorhandler(
3526 errors, &errorHandler,
3527 "utf7", "unterminated shift sequence",
3528 &starts, &e, &startinpos, &endinpos, &exc, &s,
3529 &unicode, &outpos, &p))
3530 goto onError;
3531 if (s < e)
3532 goto restart;
3533 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003534 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003535
3536 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003537 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003538 if (inShift) {
3539 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003540 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003541 }
3542 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003543 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003544 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003545 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003546
Victor Stinnerfe226c02011-10-03 03:52:20 +02003547 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003548 goto onError;
3549
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003550 Py_XDECREF(errorHandler);
3551 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003552 if (PyUnicode_READY(unicode) == -1) {
3553 Py_DECREF(unicode);
3554 return NULL;
3555 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003556 return (PyObject *)unicode;
3557
Benjamin Peterson29060642009-01-31 22:14:21 +00003558 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003559 Py_XDECREF(errorHandler);
3560 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003561 Py_DECREF(unicode);
3562 return NULL;
3563}
3564
3565
Alexander Belopolsky40018472011-02-26 01:02:56 +00003566PyObject *
3567PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003568 Py_ssize_t size,
3569 int base64SetO,
3570 int base64WhiteSpace,
3571 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003572{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003573 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003574 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003575 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003576 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003577 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003578 unsigned int base64bits = 0;
3579 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003580 char * out;
3581 char * start;
3582
3583 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003584 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003585
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003586 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003587 return PyErr_NoMemory();
3588
Antoine Pitrou244651a2009-05-04 18:56:13 +00003589 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003590 if (v == NULL)
3591 return NULL;
3592
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003593 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003594 for (;i < size; ++i) {
3595 Py_UNICODE ch = s[i];
3596
Antoine Pitrou244651a2009-05-04 18:56:13 +00003597 if (inShift) {
3598 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3599 /* shifting out */
3600 if (base64bits) { /* output remaining bits */
3601 *out++ = TO_BASE64(base64buffer << (6-base64bits));
3602 base64buffer = 0;
3603 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003604 }
3605 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003606 /* Characters not in the BASE64 set implicitly unshift the sequence
3607 so no '-' is required, except if the character is itself a '-' */
3608 if (IS_BASE64(ch) || ch == '-') {
3609 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003610 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003611 *out++ = (char) ch;
3612 }
3613 else {
3614 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00003615 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003616 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003617 else { /* not in a shift sequence */
3618 if (ch == '+') {
3619 *out++ = '+';
3620 *out++ = '-';
3621 }
3622 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3623 *out++ = (char) ch;
3624 }
3625 else {
3626 *out++ = '+';
3627 inShift = 1;
3628 goto encode_char;
3629 }
3630 }
3631 continue;
3632encode_char:
3633#ifdef Py_UNICODE_WIDE
3634 if (ch >= 0x10000) {
3635 /* code first surrogate */
3636 base64bits += 16;
3637 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
3638 while (base64bits >= 6) {
3639 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3640 base64bits -= 6;
3641 }
3642 /* prepare second surrogate */
3643 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
3644 }
3645#endif
3646 base64bits += 16;
3647 base64buffer = (base64buffer << 16) | ch;
3648 while (base64bits >= 6) {
3649 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3650 base64bits -= 6;
3651 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00003652 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003653 if (base64bits)
3654 *out++= TO_BASE64(base64buffer << (6-base64bits) );
3655 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003656 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003657 if (_PyBytes_Resize(&v, out - start) < 0)
3658 return NULL;
3659 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003660}
3661
Antoine Pitrou244651a2009-05-04 18:56:13 +00003662#undef IS_BASE64
3663#undef FROM_BASE64
3664#undef TO_BASE64
3665#undef DECODE_DIRECT
3666#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003667
Guido van Rossumd57fd912000-03-10 22:53:23 +00003668/* --- UTF-8 Codec -------------------------------------------------------- */
3669
/* Indexed by the first byte of a UTF-8 sequence; see the comment inside the
   initializer for the meaning of the values. */
static
char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
       illegal prefix.  See RFC 3629 for details */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
3691
Alexander Belopolsky40018472011-02-26 01:02:56 +00003692PyObject *
3693PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003694 Py_ssize_t size,
3695 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003696{
Walter Dörwald69652032004-09-07 20:24:22 +00003697 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3698}
3699
Antoine Pitrouab868312009-01-10 15:40:25 +00003700/* Mask to check or force alignment of a pointer to C 'long' boundaries */
3701#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
3702
3703/* Mask to quickly check whether a C 'long' contains a
3704 non-ASCII, UTF8-encoded char. */
3705#if (SIZEOF_LONG == 8)
3706# define ASCII_CHAR_MASK 0x8080808080808080L
3707#elif (SIZEOF_LONG == 4)
3708# define ASCII_CHAR_MASK 0x80808080L
3709#else
3710# error C 'long' size should be either 4 or 8!
3711#endif
3712
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003713/* Scans a UTF-8 string and returns the maximum character to be expected,
3714 the size of the decoded unicode string and if any major errors were
3715 encountered.
3716
3717 This function does check basic UTF-8 sanity, it does however NOT CHECK
3718 if the string contains surrogates, and if all continuation bytes are
3719 within the correct ranges, these checks are performed in
3720 PyUnicode_DecodeUTF8Stateful.
3721
3722 If it sets has_errors to 1, it means the value of unicode_size and max_char
3723 will be bogus and you should not rely on useful information in them.
3724 */
3725static Py_UCS4
3726utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
3727 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
3728 int *has_errors)
3729{
3730 Py_ssize_t n;
3731 Py_ssize_t char_count = 0;
3732 Py_UCS4 max_char = 127, new_max;
3733 Py_UCS4 upper_bound;
3734 const unsigned char *p = (const unsigned char *)s;
3735 const unsigned char *end = p + string_size;
3736 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
3737 int err = 0;
3738
3739 for (; p < end && !err; ++p, ++char_count) {
3740 /* Only check value if it's not a ASCII char... */
3741 if (*p < 0x80) {
3742 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
3743 an explanation. */
3744 if (!((size_t) p & LONG_PTR_MASK)) {
3745 /* Help register allocation */
3746 register const unsigned char *_p = p;
3747 while (_p < aligned_end) {
3748 unsigned long value = *(unsigned long *) _p;
3749 if (value & ASCII_CHAR_MASK)
3750 break;
3751 _p += SIZEOF_LONG;
3752 char_count += SIZEOF_LONG;
3753 }
3754 p = _p;
3755 if (p == end)
3756 break;
3757 }
3758 }
3759 if (*p >= 0x80) {
3760 n = utf8_code_length[*p];
3761 new_max = max_char;
3762 switch (n) {
3763 /* invalid start byte */
3764 case 0:
3765 err = 1;
3766 break;
3767 case 2:
3768 /* Code points between 0x00FF and 0x07FF inclusive.
3769 Approximate the upper bound of the code point,
3770 if this flips over 255 we can be sure it will be more
3771 than 255 and the string will need 2 bytes per code coint,
3772 if it stays under or equal to 255, we can be sure 1 byte
3773 is enough.
3774 ((*p & 0b00011111) << 6) | 0b00111111 */
3775 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
3776 if (max_char < upper_bound)
3777 new_max = upper_bound;
3778 /* Ensure we track at least that we left ASCII space. */
3779 if (new_max < 128)
3780 new_max = 128;
3781 break;
3782 case 3:
3783 /* Between 0x0FFF and 0xFFFF inclusive, so values are
3784 always > 255 and <= 65535 and will always need 2 bytes. */
3785 if (max_char < 65535)
3786 new_max = 65535;
3787 break;
3788 case 4:
3789 /* Code point will be above 0xFFFF for sure in this case. */
3790 new_max = 65537;
3791 break;
3792 /* Internal error, this should be caught by the first if */
3793 case 1:
3794 default:
3795 assert(0 && "Impossible case in utf8_max_char_and_size");
3796 err = 1;
3797 }
3798 /* Instead of number of overall bytes for this code point,
3799 n containts the number of following bytes: */
3800 --n;
3801 /* Check if the follow up chars are all valid continuation bytes */
3802 if (n >= 1) {
3803 const unsigned char *cont;
3804 if ((p + n) >= end) {
3805 if (consumed == 0)
3806 /* incomplete data, non-incremental decoding */
3807 err = 1;
3808 break;
3809 }
3810 for (cont = p + 1; cont < (p + n); ++cont) {
3811 if ((*cont & 0xc0) != 0x80) {
3812 err = 1;
3813 break;
3814 }
3815 }
3816 p += n;
3817 }
3818 else
3819 err = 1;
3820 max_char = new_max;
3821 }
3822 }
3823
3824 if (unicode_size)
3825 *unicode_size = char_count;
3826 if (has_errors)
3827 *has_errors = err;
3828 return max_char;
3829}
3830
/* Store `value` at position `index` of the character buffer `buf`, whose
   element type is selected by `kind`.  Behaves like PyUnicode_WRITE, except
   that it additionally accepts PyUnicode_WCHAR_KIND, in which case `buf` is
   treated as the Py_UNICODE (wstr) buffer of the legacy representation.
   Any kind other than WCHAR, 1BYTE or 2BYTE is written as Py_UCS4.
   `kind` is evaluated once; `buf`, `index` and `value` are evaluated once,
   in the branch that is taken, so arguments such as `i++` are safe. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value)                     \
    do {                                                                    \
        const int write_kind_ = (kind);                                     \
        if (write_kind_ == PyUnicode_WCHAR_KIND) {                          \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);           \
        }                                                                   \
        else if (write_kind_ == PyUnicode_1BYTE_KIND) {                     \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value);     \
        }                                                                   \
        else if (write_kind_ == PyUnicode_2BYTE_KIND) {                     \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value);                 \
        }                                                                   \
        else {                                                              \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value);                 \
        }                                                                   \
    } while (0)
3845
Alexander Belopolsky40018472011-02-26 01:02:56 +00003846PyObject *
3847PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003848 Py_ssize_t size,
3849 const char *errors,
3850 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003851{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003852 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003853 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00003854 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003855 Py_ssize_t startinpos;
3856 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00003857 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003858 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003859 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003860 PyObject *errorHandler = NULL;
3861 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003862 Py_UCS4 maxchar = 0;
3863 Py_ssize_t unicode_size;
3864 Py_ssize_t i;
3865 int kind;
3866 void *data;
3867 int has_errors;
3868 Py_UNICODE *error_outptr;
3869#if SIZEOF_WCHAR_T == 2
3870 Py_ssize_t wchar_offset = 0;
3871#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003872
Walter Dörwald69652032004-09-07 20:24:22 +00003873 if (size == 0) {
3874 if (consumed)
3875 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003876 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00003877 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003878 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
3879 consumed, &has_errors);
3880 if (has_errors) {
3881 unicode = _PyUnicode_New(size);
3882 if (!unicode)
3883 return NULL;
3884 kind = PyUnicode_WCHAR_KIND;
3885 data = PyUnicode_AS_UNICODE(unicode);
3886 assert(data != NULL);
3887 }
3888 else {
3889 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
3890 if (!unicode)
3891 return NULL;
3892 /* When the string is ASCII only, just use memcpy and return.
3893 unicode_size may be != size if there is an incomplete UTF-8
3894 sequence at the end of the ASCII block. */
3895 if (maxchar < 128 && size == unicode_size) {
3896 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
3897 return (PyObject *)unicode;
3898 }
3899 kind = PyUnicode_KIND(unicode);
3900 data = PyUnicode_DATA(unicode);
3901 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003902 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003903 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003904 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00003905 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003906
3907 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003908 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003909
3910 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00003911 /* Fast path for runs of ASCII characters. Given that common UTF-8
3912 input will consist of an overwhelming majority of ASCII
3913 characters, we try to optimize for this case by checking
3914 as many characters as a C 'long' can contain.
3915 First, check if we can do an aligned read, as most CPUs have
3916 a penalty for unaligned reads.
3917 */
3918 if (!((size_t) s & LONG_PTR_MASK)) {
3919 /* Help register allocation */
3920 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003921 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003922 while (_s < aligned_end) {
3923 /* Read a whole long at a time (either 4 or 8 bytes),
3924 and do a fast unrolled copy if it only contains ASCII
3925 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003926 unsigned long value = *(unsigned long *) _s;
3927 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00003928 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003929 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
3930 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
3931 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
3932 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00003933#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003934 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
3935 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
3936 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
3937 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00003938#endif
3939 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003940 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00003941 }
3942 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003943 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003944 if (s == e)
3945 break;
3946 ch = (unsigned char)*s;
3947 }
3948 }
3949
3950 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003951 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003952 s++;
3953 continue;
3954 }
3955
3956 n = utf8_code_length[ch];
3957
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003958 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003959 if (consumed)
3960 break;
3961 else {
3962 errmsg = "unexpected end of data";
3963 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003964 endinpos = startinpos+1;
3965 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
3966 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00003967 goto utf8Error;
3968 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003969 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003970
3971 switch (n) {
3972
3973 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00003974 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003975 startinpos = s-starts;
3976 endinpos = startinpos+1;
3977 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003978
3979 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003980 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00003981 startinpos = s-starts;
3982 endinpos = startinpos+1;
3983 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003984
3985 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003986 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00003987 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003988 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003989 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00003990 goto utf8Error;
3991 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003992 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00003993 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003994 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003995 break;
3996
3997 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00003998 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
3999 will result in surrogates in range d800-dfff. Surrogates are
4000 not valid UTF-8 so they are rejected.
4001 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4002 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004003 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004004 (s[2] & 0xc0) != 0x80 ||
4005 ((unsigned char)s[0] == 0xE0 &&
4006 (unsigned char)s[1] < 0xA0) ||
4007 ((unsigned char)s[0] == 0xED &&
4008 (unsigned char)s[1] > 0x9F)) {
4009 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004010 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004011 endinpos = startinpos + 1;
4012
4013 /* if s[1] first two bits are 1 and 0, then the invalid
4014 continuation byte is s[2], so increment endinpos by 1,
4015 if not, s[1] is invalid and endinpos doesn't need to
4016 be incremented. */
4017 if ((s[1] & 0xC0) == 0x80)
4018 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004019 goto utf8Error;
4020 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004021 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004022 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004023 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004024 break;
4025
4026 case 4:
4027 if ((s[1] & 0xc0) != 0x80 ||
4028 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004029 (s[3] & 0xc0) != 0x80 ||
4030 ((unsigned char)s[0] == 0xF0 &&
4031 (unsigned char)s[1] < 0x90) ||
4032 ((unsigned char)s[0] == 0xF4 &&
4033 (unsigned char)s[1] > 0x8F)) {
4034 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004035 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004036 endinpos = startinpos + 1;
4037 if ((s[1] & 0xC0) == 0x80) {
4038 endinpos++;
4039 if ((s[2] & 0xC0) == 0x80)
4040 endinpos++;
4041 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004042 goto utf8Error;
4043 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004044 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004045 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4046 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4047
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004048 /* If the string is flexible or we have native UCS-4, write
4049 directly.. */
4050 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
4051 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00004052
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004053 else {
4054 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00004055
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004056 /* translate from 10000..10FFFF to 0..FFFF */
4057 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00004058
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004059 /* high surrogate = top 10 bits added to D800 */
4060 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4061 (Py_UNICODE)(0xD800 + (ch >> 10)));
4062
4063 /* low surrogate = bottom 10 bits added to DC00 */
4064 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4065 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
4066 }
4067#if SIZEOF_WCHAR_T == 2
4068 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004069#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004070 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004071 }
4072 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004073 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004074
Benjamin Peterson29060642009-01-31 22:14:21 +00004075 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004076 /* If this is not yet a resizable string, make it one.. */
4077 if (kind != PyUnicode_WCHAR_KIND) {
4078 const Py_UNICODE *u;
4079 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
4080 if (!new_unicode)
4081 goto onError;
4082 u = PyUnicode_AsUnicode((PyObject *)unicode);
4083 if (!u)
4084 goto onError;
4085#if SIZEOF_WCHAR_T == 2
4086 i += wchar_offset;
4087#endif
4088 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
4089 Py_DECREF(unicode);
4090 unicode = new_unicode;
4091 kind = 0;
4092 data = PyUnicode_AS_UNICODE(new_unicode);
4093 assert(data != NULL);
4094 }
4095 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00004096 if (unicode_decode_call_errorhandler(
4097 errors, &errorHandler,
4098 "utf8", errmsg,
4099 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004100 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00004101 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004102 /* Update data because unicode_decode_call_errorhandler might have
4103 re-created or resized the unicode object. */
4104 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00004105 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004106 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004107 /* Ensure the unicode_size calculation above was correct: */
4108 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
4109
Walter Dörwald69652032004-09-07 20:24:22 +00004110 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004111 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004112
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004113 /* Adjust length and ready string when it contained errors and
4114 is of the old resizable kind. */
4115 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02004116 if (PyUnicode_Resize((PyObject**)&unicode, i) < 0 ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004117 PyUnicode_READY(unicode) == -1)
4118 goto onError;
4119 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004120
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004121 Py_XDECREF(errorHandler);
4122 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004123 if (PyUnicode_READY(unicode) == -1) {
4124 Py_DECREF(unicode);
4125 return NULL;
4126 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004127 return (PyObject *)unicode;
4128
Benjamin Peterson29060642009-01-31 22:14:21 +00004129 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004130 Py_XDECREF(errorHandler);
4131 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004132 Py_DECREF(unicode);
4133 return NULL;
4134}
4135
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004136#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00004137
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004138#ifdef __APPLE__
4139
4140/* Simplified UTF-8 decoder using surrogateescape error handler,
4141 used to decode the command line arguments on Mac OS X. */
4142
4143wchar_t*
4144_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4145{
4146 int n;
4147 const char *e;
4148 wchar_t *unicode, *p;
4149
4150 /* Note: size will always be longer than the resulting Unicode
4151 character count */
4152 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
4153 PyErr_NoMemory();
4154 return NULL;
4155 }
4156 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4157 if (!unicode)
4158 return NULL;
4159
4160 /* Unpack UTF-8 encoded data */
4161 p = unicode;
4162 e = s + size;
4163 while (s < e) {
4164 Py_UCS4 ch = (unsigned char)*s;
4165
4166 if (ch < 0x80) {
4167 *p++ = (wchar_t)ch;
4168 s++;
4169 continue;
4170 }
4171
4172 n = utf8_code_length[ch];
4173 if (s + n > e) {
4174 goto surrogateescape;
4175 }
4176
4177 switch (n) {
4178 case 0:
4179 case 1:
4180 goto surrogateescape;
4181
4182 case 2:
4183 if ((s[1] & 0xc0) != 0x80)
4184 goto surrogateescape;
4185 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
4186 assert ((ch > 0x007F) && (ch <= 0x07FF));
4187 *p++ = (wchar_t)ch;
4188 break;
4189
4190 case 3:
4191 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4192 will result in surrogates in range d800-dfff. Surrogates are
4193 not valid UTF-8 so they are rejected.
4194 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4195 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
4196 if ((s[1] & 0xc0) != 0x80 ||
4197 (s[2] & 0xc0) != 0x80 ||
4198 ((unsigned char)s[0] == 0xE0 &&
4199 (unsigned char)s[1] < 0xA0) ||
4200 ((unsigned char)s[0] == 0xED &&
4201 (unsigned char)s[1] > 0x9F)) {
4202
4203 goto surrogateescape;
4204 }
4205 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4206 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004207 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004208 break;
4209
4210 case 4:
4211 if ((s[1] & 0xc0) != 0x80 ||
4212 (s[2] & 0xc0) != 0x80 ||
4213 (s[3] & 0xc0) != 0x80 ||
4214 ((unsigned char)s[0] == 0xF0 &&
4215 (unsigned char)s[1] < 0x90) ||
4216 ((unsigned char)s[0] == 0xF4 &&
4217 (unsigned char)s[1] > 0x8F)) {
4218 goto surrogateescape;
4219 }
4220 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
4221 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4222 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4223
4224#if SIZEOF_WCHAR_T == 4
4225 *p++ = (wchar_t)ch;
4226#else
4227 /* compute and append the two surrogates: */
4228
4229 /* translate from 10000..10FFFF to 0..FFFF */
4230 ch -= 0x10000;
4231
4232 /* high surrogate = top 10 bits added to D800 */
4233 *p++ = (wchar_t)(0xD800 + (ch >> 10));
4234
4235 /* low surrogate = bottom 10 bits added to DC00 */
4236 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
4237#endif
4238 break;
4239 }
4240 s += n;
4241 continue;
4242
4243 surrogateescape:
4244 *p++ = 0xDC00 + ch;
4245 s++;
4246 }
4247 *p = L'\0';
4248 return unicode;
4249}
4250
4251#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004252
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004253/* Primary internal function which creates utf8 encoded bytes objects.
4254
4255 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004256 and allocate exactly as much space needed at the end. Else allocate the
4257 maximum possible needed (4 result bytes per Unicode character), and return
4258 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004259*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004260PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004261_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004262{
Tim Peters602f7402002-04-27 18:03:26 +00004263#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004264
Guido van Rossum98297ee2007-11-06 21:34:58 +00004265 Py_ssize_t i; /* index into s of next input byte */
4266 PyObject *result; /* result string object */
4267 char *p; /* next free byte in output buffer */
4268 Py_ssize_t nallocated; /* number of result bytes allocated */
4269 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004270 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004271 PyObject *errorHandler = NULL;
4272 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004273 int kind;
4274 void *data;
4275 Py_ssize_t size;
4276 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
4277#if SIZEOF_WCHAR_T == 2
4278 Py_ssize_t wchar_offset = 0;
4279#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004280
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004281 if (!PyUnicode_Check(unicode)) {
4282 PyErr_BadArgument();
4283 return NULL;
4284 }
4285
4286 if (PyUnicode_READY(unicode) == -1)
4287 return NULL;
4288
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004289 if (PyUnicode_UTF8(unicode))
4290 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4291 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004292
4293 kind = PyUnicode_KIND(unicode);
4294 data = PyUnicode_DATA(unicode);
4295 size = PyUnicode_GET_LENGTH(unicode);
4296
Tim Peters602f7402002-04-27 18:03:26 +00004297 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004298
Tim Peters602f7402002-04-27 18:03:26 +00004299 if (size <= MAX_SHORT_UNICHARS) {
4300 /* Write into the stack buffer; nallocated can't overflow.
4301 * At the end, we'll allocate exactly as much heap space as it
4302 * turns out we need.
4303 */
4304 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004305 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004306 p = stackbuf;
4307 }
4308 else {
4309 /* Overallocate on the heap, and give the excess back at the end. */
4310 nallocated = size * 4;
4311 if (nallocated / 4 != size) /* overflow! */
4312 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004313 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004314 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004315 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004316 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004317 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004318
Tim Peters602f7402002-04-27 18:03:26 +00004319 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004320 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004321
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004322 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004323 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004324 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004325
Guido van Rossumd57fd912000-03-10 22:53:23 +00004326 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004327 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004328 *p++ = (char)(0xc0 | (ch >> 6));
4329 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004330 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004331 Py_ssize_t newpos;
4332 PyObject *rep;
4333 Py_ssize_t repsize, k, startpos;
4334 startpos = i-1;
4335#if SIZEOF_WCHAR_T == 2
4336 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00004337#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004338 rep = unicode_encode_call_errorhandler(
4339 errors, &errorHandler, "utf-8", "surrogates not allowed",
4340 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4341 &exc, startpos, startpos+1, &newpos);
4342 if (!rep)
4343 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004344
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004345 if (PyBytes_Check(rep))
4346 repsize = PyBytes_GET_SIZE(rep);
4347 else
4348 repsize = PyUnicode_GET_SIZE(rep);
4349
4350 if (repsize > 4) {
4351 Py_ssize_t offset;
4352
4353 if (result == NULL)
4354 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004355 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004356 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004357
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004358 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4359 /* integer overflow */
4360 PyErr_NoMemory();
4361 goto error;
4362 }
4363 nallocated += repsize - 4;
4364 if (result != NULL) {
4365 if (_PyBytes_Resize(&result, nallocated) < 0)
4366 goto error;
4367 } else {
4368 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004369 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004370 goto error;
4371 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4372 }
4373 p = PyBytes_AS_STRING(result) + offset;
4374 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004375
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004376 if (PyBytes_Check(rep)) {
4377 char *prep = PyBytes_AS_STRING(rep);
4378 for(k = repsize; k > 0; k--)
4379 *p++ = *prep++;
4380 } else /* rep is unicode */ {
4381 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4382 Py_UNICODE c;
4383
4384 for(k=0; k<repsize; k++) {
4385 c = prep[k];
4386 if (0x80 <= c) {
4387 raise_encode_exception(&exc, "utf-8",
4388 PyUnicode_AS_UNICODE(unicode),
4389 size, i-1, i,
4390 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004391 goto error;
4392 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004393 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004394 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004395 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004396 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004397 } else if (ch < 0x10000) {
4398 *p++ = (char)(0xe0 | (ch >> 12));
4399 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4400 *p++ = (char)(0x80 | (ch & 0x3f));
4401 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004402 /* Encode UCS4 Unicode ordinals */
4403 *p++ = (char)(0xf0 | (ch >> 18));
4404 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4405 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4406 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004407#if SIZEOF_WCHAR_T == 2
4408 wchar_offset++;
4409#endif
Tim Peters602f7402002-04-27 18:03:26 +00004410 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004411 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004412
Guido van Rossum98297ee2007-11-06 21:34:58 +00004413 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004414 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004415 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004416 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004417 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004418 }
4419 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004420 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004421 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004422 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004423 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004424 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004425
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004426 Py_XDECREF(errorHandler);
4427 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004428 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004429 error:
4430 Py_XDECREF(errorHandler);
4431 Py_XDECREF(exc);
4432 Py_XDECREF(result);
4433 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004434
Tim Peters602f7402002-04-27 18:03:26 +00004435#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004436}
4437
Alexander Belopolsky40018472011-02-26 01:02:56 +00004438PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004439PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4440 Py_ssize_t size,
4441 const char *errors)
4442{
4443 PyObject *v, *unicode;
4444
4445 unicode = PyUnicode_FromUnicode(s, size);
4446 if (unicode == NULL)
4447 return NULL;
4448 v = _PyUnicode_AsUTF8String(unicode, errors);
4449 Py_DECREF(unicode);
4450 return v;
4451}
4452
4453PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004454PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004455{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004456 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004457}
4458
Walter Dörwald41980ca2007-08-16 21:55:45 +00004459/* --- UTF-32 Codec ------------------------------------------------------- */
4460
4461PyObject *
4462PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004463 Py_ssize_t size,
4464 const char *errors,
4465 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004466{
4467 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4468}
4469
4470PyObject *
4471PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004472 Py_ssize_t size,
4473 const char *errors,
4474 int *byteorder,
4475 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004476{
4477 const char *starts = s;
4478 Py_ssize_t startinpos;
4479 Py_ssize_t endinpos;
4480 Py_ssize_t outpos;
4481 PyUnicodeObject *unicode;
4482 Py_UNICODE *p;
4483#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004484 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004485 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004486#else
4487 const int pairs = 0;
4488#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004489 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004490 int bo = 0; /* assume native ordering by default */
4491 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004492 /* Offsets from q for retrieving bytes in the right order. */
4493#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4494 int iorder[] = {0, 1, 2, 3};
4495#else
4496 int iorder[] = {3, 2, 1, 0};
4497#endif
4498 PyObject *errorHandler = NULL;
4499 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004500
Walter Dörwald41980ca2007-08-16 21:55:45 +00004501 q = (unsigned char *)s;
4502 e = q + size;
4503
4504 if (byteorder)
4505 bo = *byteorder;
4506
4507 /* Check for BOM marks (U+FEFF) in the input and adjust current
4508 byte order setting accordingly. In native mode, the leading BOM
4509 mark is skipped, in all other modes, it is copied to the output
4510 stream as-is (giving a ZWNBSP character). */
4511 if (bo == 0) {
4512 if (size >= 4) {
4513 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004514 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004515#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004516 if (bom == 0x0000FEFF) {
4517 q += 4;
4518 bo = -1;
4519 }
4520 else if (bom == 0xFFFE0000) {
4521 q += 4;
4522 bo = 1;
4523 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004524#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004525 if (bom == 0x0000FEFF) {
4526 q += 4;
4527 bo = 1;
4528 }
4529 else if (bom == 0xFFFE0000) {
4530 q += 4;
4531 bo = -1;
4532 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004533#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004534 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004535 }
4536
4537 if (bo == -1) {
4538 /* force LE */
4539 iorder[0] = 0;
4540 iorder[1] = 1;
4541 iorder[2] = 2;
4542 iorder[3] = 3;
4543 }
4544 else if (bo == 1) {
4545 /* force BE */
4546 iorder[0] = 3;
4547 iorder[1] = 2;
4548 iorder[2] = 1;
4549 iorder[3] = 0;
4550 }
4551
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004552 /* On narrow builds we split characters outside the BMP into two
4553 codepoints => count how much extra space we need. */
4554#ifndef Py_UNICODE_WIDE
4555 for (qq = q; qq < e; qq += 4)
4556 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4557 pairs++;
4558#endif
4559
4560 /* This might be one to much, because of a BOM */
4561 unicode = _PyUnicode_New((size+3)/4+pairs);
4562 if (!unicode)
4563 return NULL;
4564 if (size == 0)
4565 return (PyObject *)unicode;
4566
4567 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004568 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004569
Walter Dörwald41980ca2007-08-16 21:55:45 +00004570 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004571 Py_UCS4 ch;
4572 /* remaining bytes at the end? (size should be divisible by 4) */
4573 if (e-q<4) {
4574 if (consumed)
4575 break;
4576 errmsg = "truncated data";
4577 startinpos = ((const char *)q)-starts;
4578 endinpos = ((const char *)e)-starts;
4579 goto utf32Error;
4580 /* The remaining input chars are ignored if the callback
4581 chooses to skip the input */
4582 }
4583 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4584 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004585
Benjamin Peterson29060642009-01-31 22:14:21 +00004586 if (ch >= 0x110000)
4587 {
4588 errmsg = "codepoint not in range(0x110000)";
4589 startinpos = ((const char *)q)-starts;
4590 endinpos = startinpos+4;
4591 goto utf32Error;
4592 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004593#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004594 if (ch >= 0x10000)
4595 {
4596 *p++ = 0xD800 | ((ch-0x10000) >> 10);
4597 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4598 }
4599 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00004600#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004601 *p++ = ch;
4602 q += 4;
4603 continue;
4604 utf32Error:
4605 outpos = p-PyUnicode_AS_UNICODE(unicode);
4606 if (unicode_decode_call_errorhandler(
4607 errors, &errorHandler,
4608 "utf32", errmsg,
4609 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4610 &unicode, &outpos, &p))
4611 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004612 }
4613
4614 if (byteorder)
4615 *byteorder = bo;
4616
4617 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004618 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004619
4620 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02004621 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004622 goto onError;
4623
4624 Py_XDECREF(errorHandler);
4625 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004626 if (PyUnicode_READY(unicode) == -1) {
4627 Py_DECREF(unicode);
4628 return NULL;
4629 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004630 return (PyObject *)unicode;
4631
Benjamin Peterson29060642009-01-31 22:14:21 +00004632 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00004633 Py_DECREF(unicode);
4634 Py_XDECREF(errorHandler);
4635 Py_XDECREF(exc);
4636 return NULL;
4637}
4638
4639PyObject *
4640PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004641 Py_ssize_t size,
4642 const char *errors,
4643 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004644{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004645 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004646 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004647 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004648#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004649 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004650#else
4651 const int pairs = 0;
4652#endif
4653 /* Offsets from p for storing byte pairs in the right order. */
4654#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4655 int iorder[] = {0, 1, 2, 3};
4656#else
4657 int iorder[] = {3, 2, 1, 0};
4658#endif
4659
Benjamin Peterson29060642009-01-31 22:14:21 +00004660#define STORECHAR(CH) \
4661 do { \
4662 p[iorder[3]] = ((CH) >> 24) & 0xff; \
4663 p[iorder[2]] = ((CH) >> 16) & 0xff; \
4664 p[iorder[1]] = ((CH) >> 8) & 0xff; \
4665 p[iorder[0]] = (CH) & 0xff; \
4666 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00004667 } while(0)
4668
4669 /* In narrow builds we can output surrogate pairs as one codepoint,
4670 so we need less space. */
4671#ifndef Py_UNICODE_WIDE
4672 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004673 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
4674 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
4675 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004676#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004677 nsize = (size - pairs + (byteorder == 0));
4678 bytesize = nsize * 4;
4679 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004680 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004681 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004682 if (v == NULL)
4683 return NULL;
4684
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004685 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004686 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004687 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004688 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004689 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004690
4691 if (byteorder == -1) {
4692 /* force LE */
4693 iorder[0] = 0;
4694 iorder[1] = 1;
4695 iorder[2] = 2;
4696 iorder[3] = 3;
4697 }
4698 else if (byteorder == 1) {
4699 /* force BE */
4700 iorder[0] = 3;
4701 iorder[1] = 2;
4702 iorder[2] = 1;
4703 iorder[3] = 0;
4704 }
4705
4706 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004707 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004708#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004709 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
4710 Py_UCS4 ch2 = *s;
4711 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
4712 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
4713 s++;
4714 size--;
4715 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004716 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004717#endif
4718 STORECHAR(ch);
4719 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004720
4721 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004722 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004723#undef STORECHAR
4724}
4725
Alexander Belopolsky40018472011-02-26 01:02:56 +00004726PyObject *
4727PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004728{
4729 if (!PyUnicode_Check(unicode)) {
4730 PyErr_BadArgument();
4731 return NULL;
4732 }
4733 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004734 PyUnicode_GET_SIZE(unicode),
4735 NULL,
4736 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004737}
4738
Guido van Rossumd57fd912000-03-10 22:53:23 +00004739/* --- UTF-16 Codec ------------------------------------------------------- */
4740
Tim Peters772747b2001-08-09 22:21:55 +00004741PyObject *
4742PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004743 Py_ssize_t size,
4744 const char *errors,
4745 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004746{
Walter Dörwald69652032004-09-07 20:24:22 +00004747 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
4748}
4749
Antoine Pitrouab868312009-01-10 15:40:25 +00004750/* Two masks for fast checking of whether a C 'long' may contain
4751 UTF16-encoded surrogate characters. This is an efficient heuristic,
4752 assuming that non-surrogate characters with a code point >= 0x8000 are
4753 rare in most input.
4754 FAST_CHAR_MASK is used when the input is in native byte ordering,
4755 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00004756*/
Antoine Pitrouab868312009-01-10 15:40:25 +00004757#if (SIZEOF_LONG == 8)
4758# define FAST_CHAR_MASK 0x8000800080008000L
4759# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
4760#elif (SIZEOF_LONG == 4)
4761# define FAST_CHAR_MASK 0x80008000L
4762# define SWAPPED_FAST_CHAR_MASK 0x00800080L
4763#else
4764# error C 'long' size should be either 4 or 8!
4765#endif
4766
Walter Dörwald69652032004-09-07 20:24:22 +00004767PyObject *
4768PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004769 Py_ssize_t size,
4770 const char *errors,
4771 int *byteorder,
4772 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004773{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004774 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004775 Py_ssize_t startinpos;
4776 Py_ssize_t endinpos;
4777 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004778 PyUnicodeObject *unicode;
4779 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004780 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00004781 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00004782 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004783 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00004784 /* Offsets from q for retrieving byte pairs in the right order. */
4785#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4786 int ihi = 1, ilo = 0;
4787#else
4788 int ihi = 0, ilo = 1;
4789#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004790 PyObject *errorHandler = NULL;
4791 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004792
4793 /* Note: size will always be longer than the resulting Unicode
4794 character count */
4795 unicode = _PyUnicode_New(size);
4796 if (!unicode)
4797 return NULL;
4798 if (size == 0)
4799 return (PyObject *)unicode;
4800
4801 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004802 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00004803 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00004804 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004805
4806 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00004807 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004808
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004809 /* Check for BOM marks (U+FEFF) in the input and adjust current
4810 byte order setting accordingly. In native mode, the leading BOM
4811 mark is skipped, in all other modes, it is copied to the output
4812 stream as-is (giving a ZWNBSP character). */
4813 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00004814 if (size >= 2) {
4815 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004816#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004817 if (bom == 0xFEFF) {
4818 q += 2;
4819 bo = -1;
4820 }
4821 else if (bom == 0xFFFE) {
4822 q += 2;
4823 bo = 1;
4824 }
Tim Petersced69f82003-09-16 20:30:58 +00004825#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004826 if (bom == 0xFEFF) {
4827 q += 2;
4828 bo = 1;
4829 }
4830 else if (bom == 0xFFFE) {
4831 q += 2;
4832 bo = -1;
4833 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004834#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004835 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004836 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004837
Tim Peters772747b2001-08-09 22:21:55 +00004838 if (bo == -1) {
4839 /* force LE */
4840 ihi = 1;
4841 ilo = 0;
4842 }
4843 else if (bo == 1) {
4844 /* force BE */
4845 ihi = 0;
4846 ilo = 1;
4847 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004848#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4849 native_ordering = ilo < ihi;
4850#else
4851 native_ordering = ilo > ihi;
4852#endif
Tim Peters772747b2001-08-09 22:21:55 +00004853
Antoine Pitrouab868312009-01-10 15:40:25 +00004854 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00004855 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004856 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00004857 /* First check for possible aligned read of a C 'long'. Unaligned
4858 reads are more expensive, better to defer to another iteration. */
4859 if (!((size_t) q & LONG_PTR_MASK)) {
4860 /* Fast path for runs of non-surrogate chars. */
4861 register const unsigned char *_q = q;
4862 Py_UNICODE *_p = p;
4863 if (native_ordering) {
4864 /* Native ordering is simple: as long as the input cannot
4865 possibly contain a surrogate char, do an unrolled copy
4866 of several 16-bit code points to the target object.
4867 The non-surrogate check is done on several input bytes
4868 at a time (as many as a C 'long' can contain). */
4869 while (_q < aligned_end) {
4870 unsigned long data = * (unsigned long *) _q;
4871 if (data & FAST_CHAR_MASK)
4872 break;
4873 _p[0] = ((unsigned short *) _q)[0];
4874 _p[1] = ((unsigned short *) _q)[1];
4875#if (SIZEOF_LONG == 8)
4876 _p[2] = ((unsigned short *) _q)[2];
4877 _p[3] = ((unsigned short *) _q)[3];
4878#endif
4879 _q += SIZEOF_LONG;
4880 _p += SIZEOF_LONG / 2;
4881 }
4882 }
4883 else {
4884 /* Byteswapped ordering is similar, but we must decompose
4885 the copy bytewise, and take care of zero'ing out the
4886 upper bytes if the target object is in 32-bit units
4887 (that is, in UCS-4 builds). */
4888 while (_q < aligned_end) {
4889 unsigned long data = * (unsigned long *) _q;
4890 if (data & SWAPPED_FAST_CHAR_MASK)
4891 break;
4892 /* Zero upper bytes in UCS-4 builds */
4893#if (Py_UNICODE_SIZE > 2)
4894 _p[0] = 0;
4895 _p[1] = 0;
4896#if (SIZEOF_LONG == 8)
4897 _p[2] = 0;
4898 _p[3] = 0;
4899#endif
4900#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004901 /* Issue #4916; UCS-4 builds on big endian machines must
4902 fill the two last bytes of each 4-byte unit. */
4903#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
4904# define OFF 2
4905#else
4906# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00004907#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004908 ((unsigned char *) _p)[OFF + 1] = _q[0];
4909 ((unsigned char *) _p)[OFF + 0] = _q[1];
4910 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
4911 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
4912#if (SIZEOF_LONG == 8)
4913 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
4914 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
4915 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
4916 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
4917#endif
4918#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00004919 _q += SIZEOF_LONG;
4920 _p += SIZEOF_LONG / 2;
4921 }
4922 }
4923 p = _p;
4924 q = _q;
4925 if (q >= e)
4926 break;
4927 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004928 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004929
Benjamin Peterson14339b62009-01-31 16:36:08 +00004930 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00004931
4932 if (ch < 0xD800 || ch > 0xDFFF) {
4933 *p++ = ch;
4934 continue;
4935 }
4936
4937 /* UTF-16 code pair: */
4938 if (q > e) {
4939 errmsg = "unexpected end of data";
4940 startinpos = (((const char *)q) - 2) - starts;
4941 endinpos = ((const char *)e) + 1 - starts;
4942 goto utf16Error;
4943 }
4944 if (0xD800 <= ch && ch <= 0xDBFF) {
4945 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
4946 q += 2;
4947 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00004948#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004949 *p++ = ch;
4950 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004951#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004952 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004953#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004954 continue;
4955 }
4956 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004957 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00004958 startinpos = (((const char *)q)-4)-starts;
4959 endinpos = startinpos+2;
4960 goto utf16Error;
4961 }
4962
Benjamin Peterson14339b62009-01-31 16:36:08 +00004963 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004964 errmsg = "illegal encoding";
4965 startinpos = (((const char *)q)-2)-starts;
4966 endinpos = startinpos+2;
4967 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004968
Benjamin Peterson29060642009-01-31 22:14:21 +00004969 utf16Error:
4970 outpos = p - PyUnicode_AS_UNICODE(unicode);
4971 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00004972 errors,
4973 &errorHandler,
4974 "utf16", errmsg,
4975 &starts,
4976 (const char **)&e,
4977 &startinpos,
4978 &endinpos,
4979 &exc,
4980 (const char **)&q,
4981 &unicode,
4982 &outpos,
4983 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00004984 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004985 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004986 /* remaining byte at the end? (size should be even) */
4987 if (e == q) {
4988 if (!consumed) {
4989 errmsg = "truncated data";
4990 startinpos = ((const char *)q) - starts;
4991 endinpos = ((const char *)e) + 1 - starts;
4992 outpos = p - PyUnicode_AS_UNICODE(unicode);
4993 if (unicode_decode_call_errorhandler(
4994 errors,
4995 &errorHandler,
4996 "utf16", errmsg,
4997 &starts,
4998 (const char **)&e,
4999 &startinpos,
5000 &endinpos,
5001 &exc,
5002 (const char **)&q,
5003 &unicode,
5004 &outpos,
5005 &p))
5006 goto onError;
5007 /* The remaining input chars are ignored if the callback
5008 chooses to skip the input */
5009 }
5010 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005011
5012 if (byteorder)
5013 *byteorder = bo;
5014
Walter Dörwald69652032004-09-07 20:24:22 +00005015 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005016 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005017
Guido van Rossumd57fd912000-03-10 22:53:23 +00005018 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02005019 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005020 goto onError;
5021
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005022 Py_XDECREF(errorHandler);
5023 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005024 if (PyUnicode_READY(unicode) == -1) {
5025 Py_DECREF(unicode);
5026 return NULL;
5027 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005028 return (PyObject *)unicode;
5029
Benjamin Peterson29060642009-01-31 22:14:21 +00005030 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005031 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005032 Py_XDECREF(errorHandler);
5033 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005034 return NULL;
5035}
5036
Antoine Pitrouab868312009-01-10 15:40:25 +00005037#undef FAST_CHAR_MASK
5038#undef SWAPPED_FAST_CHAR_MASK
5039
Tim Peters772747b2001-08-09 22:21:55 +00005040PyObject *
5041PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005042 Py_ssize_t size,
5043 const char *errors,
5044 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005045{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005046 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005047 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005048 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005049#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005050 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005051#else
5052 const int pairs = 0;
5053#endif
Tim Peters772747b2001-08-09 22:21:55 +00005054 /* Offsets from p for storing byte pairs in the right order. */
5055#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5056 int ihi = 1, ilo = 0;
5057#else
5058 int ihi = 0, ilo = 1;
5059#endif
5060
Benjamin Peterson29060642009-01-31 22:14:21 +00005061#define STORECHAR(CH) \
5062 do { \
5063 p[ihi] = ((CH) >> 8) & 0xff; \
5064 p[ilo] = (CH) & 0xff; \
5065 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005066 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005067
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005068#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005069 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00005070 if (s[i] >= 0x10000)
5071 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005072#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005073 /* 2 * (size + pairs + (byteorder == 0)) */
5074 if (size > PY_SSIZE_T_MAX ||
5075 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005076 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005077 nsize = size + pairs + (byteorder == 0);
5078 bytesize = nsize * 2;
5079 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005080 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005081 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005082 if (v == NULL)
5083 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005084
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005085 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005086 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005087 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005088 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005089 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005090
5091 if (byteorder == -1) {
5092 /* force LE */
5093 ihi = 1;
5094 ilo = 0;
5095 }
5096 else if (byteorder == 1) {
5097 /* force BE */
5098 ihi = 0;
5099 ilo = 1;
5100 }
5101
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005102 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005103 Py_UNICODE ch = *s++;
5104 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005105#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005106 if (ch >= 0x10000) {
5107 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5108 ch = 0xD800 | ((ch-0x10000) >> 10);
5109 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005110#endif
Tim Peters772747b2001-08-09 22:21:55 +00005111 STORECHAR(ch);
5112 if (ch2)
5113 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005114 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005115
5116 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005117 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005118#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005119}
5120
Alexander Belopolsky40018472011-02-26 01:02:56 +00005121PyObject *
5122PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005123{
5124 if (!PyUnicode_Check(unicode)) {
5125 PyErr_BadArgument();
5126 return NULL;
5127 }
5128 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005129 PyUnicode_GET_SIZE(unicode),
5130 NULL,
5131 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005132}
5133
5134/* --- Unicode Escape Codec ----------------------------------------------- */
5135
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005136/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5137 if all the escapes in the string make it still a valid ASCII string.
5138 Returns -1 if any escapes were found which cause the string to
5139 pop out of ASCII range. Otherwise returns the length of the
5140 required buffer to hold the string.
5141 */
5142Py_ssize_t
5143length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5144{
5145 const unsigned char *p = (const unsigned char *)s;
5146 const unsigned char *end = p + size;
5147 Py_ssize_t length = 0;
5148
5149 if (size < 0)
5150 return -1;
5151
5152 for (; p < end; ++p) {
5153 if (*p > 127) {
5154 /* Non-ASCII */
5155 return -1;
5156 }
5157 else if (*p != '\\') {
5158 /* Normal character */
5159 ++length;
5160 }
5161 else {
5162 /* Backslash-escape, check next char */
5163 ++p;
5164 /* Escape sequence reaches till end of string or
5165 non-ASCII follow-up. */
5166 if (p >= end || *p > 127)
5167 return -1;
5168 switch (*p) {
5169 case '\n':
5170 /* backslash + \n result in zero characters */
5171 break;
5172 case '\\': case '\'': case '\"':
5173 case 'b': case 'f': case 't':
5174 case 'n': case 'r': case 'v': case 'a':
5175 ++length;
5176 break;
5177 case '0': case '1': case '2': case '3':
5178 case '4': case '5': case '6': case '7':
5179 case 'x': case 'u': case 'U': case 'N':
5180 /* these do not guarantee ASCII characters */
5181 return -1;
5182 default:
5183 /* count the backslash + the other character */
5184 length += 2;
5185 }
5186 }
5187 }
5188 return length;
5189}
5190
/* Store `value` at position `index` of `buf`.  When `kind` is
   PyUnicode_WCHAR_KIND the buffer is treated as an array of Py_UNICODE;
   for any other kind it is treated as an array of bytes (the ASCII
   case). */
#define WRITE_ASCII_OR_WSTR(kind, buf, index, value)                      \
    do {                                                                  \
        if ((kind) == PyUnicode_WCHAR_KIND)                               \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);         \
        else                                                              \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value);   \
    } while (0)

/* Store `value` into a Py_UNICODE buffer.  Asserts that the variable
   `kind` visible at the point of use is PyUnicode_WCHAR_KIND. */
#define WRITE_WSTR(buf, index, value)                                     \
    (assert(kind == PyUnicode_WCHAR_KIND),                                \
     ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value))

5204
5205
Fredrik Lundh06d12682001-01-24 07:59:11 +00005206static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005207
Alexander Belopolsky40018472011-02-26 01:02:56 +00005208PyObject *
5209PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005210 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005211 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005212{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005213 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005214 Py_ssize_t startinpos;
5215 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005216 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005217 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005218 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005219 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005220 char* message;
5221 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005222 PyObject *errorHandler = NULL;
5223 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005224 Py_ssize_t ascii_length;
5225 Py_ssize_t i;
5226 int kind;
5227 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005228
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005229 ascii_length = length_of_escaped_ascii_string(s, size);
5230
5231 /* After length_of_escaped_ascii_string() there are two alternatives,
5232 either the string is pure ASCII with named escapes like \n, etc.
5233 and we determined it's exact size (common case)
5234 or it contains \x, \u, ... escape sequences. then we create a
5235 legacy wchar string and resize it at the end of this function. */
5236 if (ascii_length >= 0) {
5237 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
5238 if (!v)
5239 goto onError;
5240 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5241 kind = PyUnicode_1BYTE_KIND;
5242 data = PyUnicode_DATA(v);
5243 }
5244 else {
5245 /* Escaped strings will always be longer than the resulting
5246 Unicode string, so we start with size here and then reduce the
5247 length after conversion to the true value.
5248 (but if the error callback returns a long replacement string
5249 we'll have to allocate more space) */
5250 v = _PyUnicode_New(size);
5251 if (!v)
5252 goto onError;
5253 kind = PyUnicode_WCHAR_KIND;
5254 data = PyUnicode_AS_UNICODE(v);
5255 }
5256
Guido van Rossumd57fd912000-03-10 22:53:23 +00005257 if (size == 0)
5258 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005259 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005260 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005261
Guido van Rossumd57fd912000-03-10 22:53:23 +00005262 while (s < end) {
5263 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005264 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005265 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005266
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005267 if (kind == PyUnicode_WCHAR_KIND) {
5268 assert(i < _PyUnicode_WSTR_LENGTH(v));
5269 }
5270 else {
5271 /* The only case in which i == ascii_length is a backslash
5272 followed by a newline. */
5273 assert(i <= ascii_length);
5274 }
5275
Guido van Rossumd57fd912000-03-10 22:53:23 +00005276 /* Non-escape characters are interpreted as Unicode ordinals */
5277 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005278 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005279 continue;
5280 }
5281
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005282 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005283 /* \ - Escapes */
5284 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005285 c = *s++;
5286 if (s > end)
5287 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005288
5289 if (kind == PyUnicode_WCHAR_KIND) {
5290 assert(i < _PyUnicode_WSTR_LENGTH(v));
5291 }
5292 else {
5293 /* The only case in which i == ascii_length is a backslash
5294 followed by a newline. */
5295 assert(i < ascii_length || (i == ascii_length && c == '\n'));
5296 }
5297
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005298 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005299
Benjamin Peterson29060642009-01-31 22:14:21 +00005300 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005301 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005302 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5303 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5304 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5305 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5306 /* FF */
5307 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5308 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5309 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5310 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5311 /* VT */
5312 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5313 /* BEL, not classic C */
5314 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005315
Benjamin Peterson29060642009-01-31 22:14:21 +00005316 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005317 case '0': case '1': case '2': case '3':
5318 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005319 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005320 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005321 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005322 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005323 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005324 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005325 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005326 break;
5327
Benjamin Peterson29060642009-01-31 22:14:21 +00005328 /* hex escapes */
5329 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005330 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005331 digits = 2;
5332 message = "truncated \\xXX escape";
5333 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005334
Benjamin Peterson29060642009-01-31 22:14:21 +00005335 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005336 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005337 digits = 4;
5338 message = "truncated \\uXXXX escape";
5339 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005340
Benjamin Peterson29060642009-01-31 22:14:21 +00005341 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005342 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005343 digits = 8;
5344 message = "truncated \\UXXXXXXXX escape";
5345 hexescape:
5346 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005347 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005348 if (s+digits>end) {
5349 endinpos = size;
5350 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005351 errors, &errorHandler,
5352 "unicodeescape", "end of string in escape sequence",
5353 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005354 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005355 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005356 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005357 goto nextByte;
5358 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005359 for (j = 0; j < digits; ++j) {
5360 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005361 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005362 endinpos = (s+j+1)-starts;
5363 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005364 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005365 errors, &errorHandler,
5366 "unicodeescape", message,
5367 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005368 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005369 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005370 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005371 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005372 }
5373 chr = (chr<<4) & ~0xF;
5374 if (c >= '0' && c <= '9')
5375 chr += c - '0';
5376 else if (c >= 'a' && c <= 'f')
5377 chr += 10 + c - 'a';
5378 else
5379 chr += 10 + c - 'A';
5380 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005381 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005382 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005383 /* _decoding_error will have already written into the
5384 target buffer. */
5385 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005386 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005387 /* when we get here, chr is a 32-bit unicode character */
5388 if (chr <= 0xffff)
5389 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005390 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005391 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005392 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005393 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005394#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005395 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005396#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005397 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005398 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5399 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005400#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005401 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005402 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005403 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005404 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005405 errors, &errorHandler,
5406 "unicodeescape", "illegal Unicode character",
5407 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005408 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005409 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005410 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005411 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005412 break;
5413
Benjamin Peterson29060642009-01-31 22:14:21 +00005414 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005415 case 'N':
5416 message = "malformed \\N character escape";
5417 if (ucnhash_CAPI == NULL) {
5418 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005419 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5420 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005421 if (ucnhash_CAPI == NULL)
5422 goto ucnhashError;
5423 }
5424 if (*s == '{') {
5425 const char *start = s+1;
5426 /* look for the closing brace */
5427 while (*s != '}' && s < end)
5428 s++;
5429 if (s > start && s < end && *s == '}') {
5430 /* found a name. look it up in the unicode database */
5431 message = "unknown Unicode character name";
5432 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005433 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5434 &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005435 goto store;
5436 }
5437 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005438 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005439 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005440 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005441 errors, &errorHandler,
5442 "unicodeescape", message,
5443 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005444 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005445 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005446 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005447 break;
5448
5449 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005450 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005451 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005452 message = "\\ at end of string";
5453 s--;
5454 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005455 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005456 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005457 errors, &errorHandler,
5458 "unicodeescape", message,
5459 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005460 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005461 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005462 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005463 }
5464 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005465 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5466 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005467 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005468 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005469 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005470 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005471 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005472 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005473 /* Ensure the length prediction worked in case of ASCII strings */
5474 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5475
Victor Stinnerfe226c02011-10-03 03:52:20 +02005476 if (kind == PyUnicode_WCHAR_KIND)
5477 {
5478 if (PyUnicode_Resize((PyObject**)&v, i) < 0)
5479 goto onError;
5480 if (PyUnicode_READY(v) == -1)
5481 goto onError;
5482 }
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005483 Py_XDECREF(errorHandler);
5484 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005485 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005486
Benjamin Peterson29060642009-01-31 22:14:21 +00005487 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005488 PyErr_SetString(
5489 PyExc_UnicodeError,
5490 "\\N escapes not supported (can't load unicodedata module)"
5491 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005492 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005493 Py_XDECREF(errorHandler);
5494 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005495 return NULL;
5496
Benjamin Peterson29060642009-01-31 22:14:21 +00005497 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005498 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005499 Py_XDECREF(errorHandler);
5500 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005501 return NULL;
5502}
5503
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005504#undef WRITE_ASCII_OR_WSTR
5505#undef WRITE_WSTR
5506
/* Return a bytes object holding the Unicode-Escape encoded version of
   the given Py_UNICODE buffer.

   No surrounding quotes are added to the result.

*/
5513
/* Lower-case hexadecimal digit characters, indexed by a 4-bit value
   (0..15).  Used by the escape encoders below to spell out the
   '\xhh', '\uxxxx' and '\Uxxxxxxxx' forms. */
static const char *hexdigits = "0123456789abcdef";
5515
Alexander Belopolsky40018472011-02-26 01:02:56 +00005516PyObject *
5517PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005518 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005519{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005520 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005521 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005522
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005523#ifdef Py_UNICODE_WIDE
5524 const Py_ssize_t expandsize = 10;
5525#else
5526 const Py_ssize_t expandsize = 6;
5527#endif
5528
Thomas Wouters89f507f2006-12-13 04:49:30 +00005529 /* XXX(nnorwitz): rather than over-allocating, it would be
5530 better to choose a different scheme. Perhaps scan the
5531 first N-chars of the string and allocate based on that size.
5532 */
5533 /* Initial allocation is based on the longest-possible unichr
5534 escape.
5535
5536 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5537 unichr, so in this case it's the longest unichr escape. In
5538 narrow (UTF-16) builds this is five chars per source unichr
5539 since there are two unichrs in the surrogate pair, so in narrow
5540 (UTF-16) builds it's not the longest unichr escape.
5541
5542 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5543 so in the narrow (UTF-16) build case it's the longest unichr
5544 escape.
5545 */
5546
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005547 if (size == 0)
5548 return PyBytes_FromStringAndSize(NULL, 0);
5549
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005550 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005551 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005552
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005553 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005554 2
5555 + expandsize*size
5556 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005557 if (repr == NULL)
5558 return NULL;
5559
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005560 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005561
Guido van Rossumd57fd912000-03-10 22:53:23 +00005562 while (size-- > 0) {
5563 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005564
Walter Dörwald79e913e2007-05-12 11:08:06 +00005565 /* Escape backslashes */
5566 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005567 *p++ = '\\';
5568 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005569 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005570 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005571
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00005572#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005573 /* Map 21-bit characters to '\U00xxxxxx' */
5574 else if (ch >= 0x10000) {
5575 *p++ = '\\';
5576 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005577 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5578 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5579 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5580 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5581 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5582 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5583 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5584 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005585 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005586 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005587#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005588 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5589 else if (ch >= 0xD800 && ch < 0xDC00) {
5590 Py_UNICODE ch2;
5591 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00005592
Benjamin Peterson29060642009-01-31 22:14:21 +00005593 ch2 = *s++;
5594 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005595 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005596 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5597 *p++ = '\\';
5598 *p++ = 'U';
5599 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
5600 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
5601 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
5602 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
5603 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
5604 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
5605 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
5606 *p++ = hexdigits[ucs & 0x0000000F];
5607 continue;
5608 }
5609 /* Fall through: isolated surrogates are copied as-is */
5610 s--;
5611 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005612 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005613#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005614
Guido van Rossumd57fd912000-03-10 22:53:23 +00005615 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005616 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005617 *p++ = '\\';
5618 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005619 *p++ = hexdigits[(ch >> 12) & 0x000F];
5620 *p++ = hexdigits[(ch >> 8) & 0x000F];
5621 *p++ = hexdigits[(ch >> 4) & 0x000F];
5622 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005623 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005624
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005625 /* Map special whitespace to '\t', \n', '\r' */
5626 else if (ch == '\t') {
5627 *p++ = '\\';
5628 *p++ = 't';
5629 }
5630 else if (ch == '\n') {
5631 *p++ = '\\';
5632 *p++ = 'n';
5633 }
5634 else if (ch == '\r') {
5635 *p++ = '\\';
5636 *p++ = 'r';
5637 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005638
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005639 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005640 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005641 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005642 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005643 *p++ = hexdigits[(ch >> 4) & 0x000F];
5644 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005645 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005646
Guido van Rossumd57fd912000-03-10 22:53:23 +00005647 /* Copy everything else as-is */
5648 else
5649 *p++ = (char) ch;
5650 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005651
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005652 assert(p - PyBytes_AS_STRING(repr) > 0);
5653 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5654 return NULL;
5655 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005656}
5657
Alexander Belopolsky40018472011-02-26 01:02:56 +00005658PyObject *
5659PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005660{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005661 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005662 if (!PyUnicode_Check(unicode)) {
5663 PyErr_BadArgument();
5664 return NULL;
5665 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00005666 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5667 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005668 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005669}
5670
5671/* --- Raw Unicode Escape Codec ------------------------------------------- */
5672
Alexander Belopolsky40018472011-02-26 01:02:56 +00005673PyObject *
5674PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005675 Py_ssize_t size,
5676 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005677{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005678 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005679 Py_ssize_t startinpos;
5680 Py_ssize_t endinpos;
5681 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005682 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005683 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005684 const char *end;
5685 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005686 PyObject *errorHandler = NULL;
5687 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005688
Guido van Rossumd57fd912000-03-10 22:53:23 +00005689 /* Escaped strings will always be longer than the resulting
5690 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005691 length after conversion to the true value. (But decoding error
5692 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005693 v = _PyUnicode_New(size);
5694 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005695 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005696 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005697 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005698 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005699 end = s + size;
5700 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005701 unsigned char c;
5702 Py_UCS4 x;
5703 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005704 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005705
Benjamin Peterson29060642009-01-31 22:14:21 +00005706 /* Non-escape characters are interpreted as Unicode ordinals */
5707 if (*s != '\\') {
5708 *p++ = (unsigned char)*s++;
5709 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005710 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005711 startinpos = s-starts;
5712
5713 /* \u-escapes are only interpreted iff the number of leading
5714 backslashes if odd */
5715 bs = s;
5716 for (;s < end;) {
5717 if (*s != '\\')
5718 break;
5719 *p++ = (unsigned char)*s++;
5720 }
5721 if (((s - bs) & 1) == 0 ||
5722 s >= end ||
5723 (*s != 'u' && *s != 'U')) {
5724 continue;
5725 }
5726 p--;
5727 count = *s=='u' ? 4 : 8;
5728 s++;
5729
5730 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
5731 outpos = p-PyUnicode_AS_UNICODE(v);
5732 for (x = 0, i = 0; i < count; ++i, ++s) {
5733 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005734 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005735 endinpos = s-starts;
5736 if (unicode_decode_call_errorhandler(
5737 errors, &errorHandler,
5738 "rawunicodeescape", "truncated \\uXXXX",
5739 &starts, &end, &startinpos, &endinpos, &exc, &s,
5740 &v, &outpos, &p))
5741 goto onError;
5742 goto nextByte;
5743 }
5744 x = (x<<4) & ~0xF;
5745 if (c >= '0' && c <= '9')
5746 x += c - '0';
5747 else if (c >= 'a' && c <= 'f')
5748 x += 10 + c - 'a';
5749 else
5750 x += 10 + c - 'A';
5751 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00005752 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00005753 /* UCS-2 character */
5754 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005755 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005756 /* UCS-4 character. Either store directly, or as
5757 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00005758#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005759 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005760#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005761 x -= 0x10000L;
5762 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
5763 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00005764#endif
5765 } else {
5766 endinpos = s-starts;
5767 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005768 if (unicode_decode_call_errorhandler(
5769 errors, &errorHandler,
5770 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005771 &starts, &end, &startinpos, &endinpos, &exc, &s,
5772 &v, &outpos, &p))
5773 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005774 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005775 nextByte:
5776 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005777 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02005778 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005779 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005780 Py_XDECREF(errorHandler);
5781 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005782 if (PyUnicode_READY(v) == -1) {
5783 Py_DECREF(v);
5784 return NULL;
5785 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005786 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005787
Benjamin Peterson29060642009-01-31 22:14:21 +00005788 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005789 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005790 Py_XDECREF(errorHandler);
5791 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005792 return NULL;
5793}
5794
Alexander Belopolsky40018472011-02-26 01:02:56 +00005795PyObject *
5796PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005797 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005798{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005799 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005800 char *p;
5801 char *q;
5802
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005803#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005804 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005805#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005806 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005807#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00005808
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005809 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005810 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005811
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005812 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005813 if (repr == NULL)
5814 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005815 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005816 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005817
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005818 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005819 while (size-- > 0) {
5820 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005821#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005822 /* Map 32-bit characters to '\Uxxxxxxxx' */
5823 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005824 *p++ = '\\';
5825 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005826 *p++ = hexdigits[(ch >> 28) & 0xf];
5827 *p++ = hexdigits[(ch >> 24) & 0xf];
5828 *p++ = hexdigits[(ch >> 20) & 0xf];
5829 *p++ = hexdigits[(ch >> 16) & 0xf];
5830 *p++ = hexdigits[(ch >> 12) & 0xf];
5831 *p++ = hexdigits[(ch >> 8) & 0xf];
5832 *p++ = hexdigits[(ch >> 4) & 0xf];
5833 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00005834 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005835 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00005836#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005837 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5838 if (ch >= 0xD800 && ch < 0xDC00) {
5839 Py_UNICODE ch2;
5840 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005841
Benjamin Peterson29060642009-01-31 22:14:21 +00005842 ch2 = *s++;
5843 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005844 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005845 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5846 *p++ = '\\';
5847 *p++ = 'U';
5848 *p++ = hexdigits[(ucs >> 28) & 0xf];
5849 *p++ = hexdigits[(ucs >> 24) & 0xf];
5850 *p++ = hexdigits[(ucs >> 20) & 0xf];
5851 *p++ = hexdigits[(ucs >> 16) & 0xf];
5852 *p++ = hexdigits[(ucs >> 12) & 0xf];
5853 *p++ = hexdigits[(ucs >> 8) & 0xf];
5854 *p++ = hexdigits[(ucs >> 4) & 0xf];
5855 *p++ = hexdigits[ucs & 0xf];
5856 continue;
5857 }
5858 /* Fall through: isolated surrogates are copied as-is */
5859 s--;
5860 size++;
5861 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005862#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005863 /* Map 16-bit characters to '\uxxxx' */
5864 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005865 *p++ = '\\';
5866 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005867 *p++ = hexdigits[(ch >> 12) & 0xf];
5868 *p++ = hexdigits[(ch >> 8) & 0xf];
5869 *p++ = hexdigits[(ch >> 4) & 0xf];
5870 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005871 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005872 /* Copy everything else as-is */
5873 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00005874 *p++ = (char) ch;
5875 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005876 size = p - q;
5877
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005878 assert(size > 0);
5879 if (_PyBytes_Resize(&repr, size) < 0)
5880 return NULL;
5881 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005882}
5883
Alexander Belopolsky40018472011-02-26 01:02:56 +00005884PyObject *
5885PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005886{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005887 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005888 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00005889 PyErr_BadArgument();
5890 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005891 }
Walter Dörwald711005d2007-05-12 12:03:26 +00005892 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5893 PyUnicode_GET_SIZE(unicode));
5894
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005895 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005896}
5897
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005898/* --- Unicode Internal Codec ------------------------------------------- */
5899
Alexander Belopolsky40018472011-02-26 01:02:56 +00005900PyObject *
5901_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005902 Py_ssize_t size,
5903 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005904{
5905 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005906 Py_ssize_t startinpos;
5907 Py_ssize_t endinpos;
5908 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005909 PyUnicodeObject *v;
5910 Py_UNICODE *p;
5911 const char *end;
5912 const char *reason;
5913 PyObject *errorHandler = NULL;
5914 PyObject *exc = NULL;
5915
Neal Norwitzd43069c2006-01-08 01:12:10 +00005916#ifdef Py_UNICODE_WIDE
5917 Py_UNICODE unimax = PyUnicode_GetMax();
5918#endif
5919
Thomas Wouters89f507f2006-12-13 04:49:30 +00005920 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005921 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
5922 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005923 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005924 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
5925 as string was created with the old API. */
5926 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005927 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005928 p = PyUnicode_AS_UNICODE(v);
5929 end = s + size;
5930
5931 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005932 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005933 /* We have to sanity check the raw data, otherwise doom looms for
5934 some malformed UCS-4 data. */
5935 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00005936#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005937 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00005938#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005939 end-s < Py_UNICODE_SIZE
5940 )
Benjamin Peterson29060642009-01-31 22:14:21 +00005941 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005942 startinpos = s - starts;
5943 if (end-s < Py_UNICODE_SIZE) {
5944 endinpos = end-starts;
5945 reason = "truncated input";
5946 }
5947 else {
5948 endinpos = s - starts + Py_UNICODE_SIZE;
5949 reason = "illegal code point (> 0x10FFFF)";
5950 }
5951 outpos = p - PyUnicode_AS_UNICODE(v);
5952 if (unicode_decode_call_errorhandler(
5953 errors, &errorHandler,
5954 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00005955 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00005956 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005957 goto onError;
5958 }
5959 }
5960 else {
5961 p++;
5962 s += Py_UNICODE_SIZE;
5963 }
5964 }
5965
Victor Stinnerfe226c02011-10-03 03:52:20 +02005966 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005967 goto onError;
5968 Py_XDECREF(errorHandler);
5969 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005970 if (PyUnicode_READY(v) == -1) {
5971 Py_DECREF(v);
5972 return NULL;
5973 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005974 return (PyObject *)v;
5975
Benjamin Peterson29060642009-01-31 22:14:21 +00005976 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005977 Py_XDECREF(v);
5978 Py_XDECREF(errorHandler);
5979 Py_XDECREF(exc);
5980 return NULL;
5981}
5982
Guido van Rossumd57fd912000-03-10 22:53:23 +00005983/* --- Latin-1 Codec ------------------------------------------------------ */
5984
Alexander Belopolsky40018472011-02-26 01:02:56 +00005985PyObject *
5986PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005987 Py_ssize_t size,
5988 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005989{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005990 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02005991 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005992}
5993
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005994/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005995static void
5996make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005997 const char *encoding,
5998 const Py_UNICODE *unicode, Py_ssize_t size,
5999 Py_ssize_t startpos, Py_ssize_t endpos,
6000 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006001{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006002 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006003 *exceptionObject = PyUnicodeEncodeError_Create(
6004 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006005 }
6006 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006007 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6008 goto onError;
6009 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6010 goto onError;
6011 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6012 goto onError;
6013 return;
6014 onError:
6015 Py_DECREF(*exceptionObject);
6016 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006017 }
6018}
6019
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006020/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006021static void
6022raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006023 const char *encoding,
6024 const Py_UNICODE *unicode, Py_ssize_t size,
6025 Py_ssize_t startpos, Py_ssize_t endpos,
6026 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006027{
6028 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006029 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006030 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006031 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006032}
6033
6034/* error handling callback helper:
6035 build arguments, call the callback and check the arguments,
6036 put the result into newpos and return the replacement string, which
6037 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006038static PyObject *
6039unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006040 PyObject **errorHandler,
6041 const char *encoding, const char *reason,
6042 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
6043 Py_ssize_t startpos, Py_ssize_t endpos,
6044 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006045{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006046 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006047
6048 PyObject *restuple;
6049 PyObject *resunicode;
6050
6051 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006052 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006053 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006054 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006055 }
6056
6057 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006058 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006059 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006060 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006061
6062 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006063 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006064 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006065 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006066 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006067 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006068 Py_DECREF(restuple);
6069 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006070 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006071 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006072 &resunicode, newpos)) {
6073 Py_DECREF(restuple);
6074 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006075 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006076 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6077 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6078 Py_DECREF(restuple);
6079 return NULL;
6080 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006081 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006082 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006083 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006084 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6085 Py_DECREF(restuple);
6086 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006087 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006088 Py_INCREF(resunicode);
6089 Py_DECREF(restuple);
6090 return resunicode;
6091}
6092
Alexander Belopolsky40018472011-02-26 01:02:56 +00006093static PyObject *
6094unicode_encode_ucs1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006095 Py_ssize_t size,
6096 const char *errors,
6097 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006098{
6099 /* output object */
6100 PyObject *res;
6101 /* pointers to the beginning and end+1 of input */
6102 const Py_UNICODE *startp = p;
6103 const Py_UNICODE *endp = p + size;
6104 /* pointer to the beginning of the unencodable characters */
6105 /* const Py_UNICODE *badp = NULL; */
6106 /* pointer into the output */
6107 char *str;
6108 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006109 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006110 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6111 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006112 PyObject *errorHandler = NULL;
6113 PyObject *exc = NULL;
6114 /* the following variable is used for caching string comparisons
6115 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6116 int known_errorHandler = -1;
6117
6118 /* allocate enough for a simple encoding without
6119 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006120 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006121 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006122 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006123 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006124 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006125 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006126 ressize = size;
6127
6128 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006129 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006130
Benjamin Peterson29060642009-01-31 22:14:21 +00006131 /* can we encode this? */
6132 if (c<limit) {
6133 /* no overflow check, because we know that the space is enough */
6134 *str++ = (char)c;
6135 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006136 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006137 else {
6138 Py_ssize_t unicodepos = p-startp;
6139 Py_ssize_t requiredsize;
6140 PyObject *repunicode;
6141 Py_ssize_t repsize;
6142 Py_ssize_t newpos;
6143 Py_ssize_t respos;
6144 Py_UNICODE *uni2;
6145 /* startpos for collecting unencodable chars */
6146 const Py_UNICODE *collstart = p;
6147 const Py_UNICODE *collend = p;
6148 /* find all unecodable characters */
6149 while ((collend < endp) && ((*collend)>=limit))
6150 ++collend;
6151 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6152 if (known_errorHandler==-1) {
6153 if ((errors==NULL) || (!strcmp(errors, "strict")))
6154 known_errorHandler = 1;
6155 else if (!strcmp(errors, "replace"))
6156 known_errorHandler = 2;
6157 else if (!strcmp(errors, "ignore"))
6158 known_errorHandler = 3;
6159 else if (!strcmp(errors, "xmlcharrefreplace"))
6160 known_errorHandler = 4;
6161 else
6162 known_errorHandler = 0;
6163 }
6164 switch (known_errorHandler) {
6165 case 1: /* strict */
6166 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
6167 goto onError;
6168 case 2: /* replace */
6169 while (collstart++<collend)
6170 *str++ = '?'; /* fall through */
6171 case 3: /* ignore */
6172 p = collend;
6173 break;
6174 case 4: /* xmlcharrefreplace */
6175 respos = str - PyBytes_AS_STRING(res);
6176 /* determine replacement size (temporarily (mis)uses p) */
6177 for (p = collstart, repsize = 0; p < collend; ++p) {
6178 if (*p<10)
6179 repsize += 2+1+1;
6180 else if (*p<100)
6181 repsize += 2+2+1;
6182 else if (*p<1000)
6183 repsize += 2+3+1;
6184 else if (*p<10000)
6185 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006186#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006187 else
6188 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006189#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006190 else if (*p<100000)
6191 repsize += 2+5+1;
6192 else if (*p<1000000)
6193 repsize += 2+6+1;
6194 else
6195 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00006196#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006197 }
6198 requiredsize = respos+repsize+(endp-collend);
6199 if (requiredsize > ressize) {
6200 if (requiredsize<2*ressize)
6201 requiredsize = 2*ressize;
6202 if (_PyBytes_Resize(&res, requiredsize))
6203 goto onError;
6204 str = PyBytes_AS_STRING(res) + respos;
6205 ressize = requiredsize;
6206 }
6207 /* generate replacement (temporarily (mis)uses p) */
6208 for (p = collstart; p < collend; ++p) {
6209 str += sprintf(str, "&#%d;", (int)*p);
6210 }
6211 p = collend;
6212 break;
6213 default:
6214 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6215 encoding, reason, startp, size, &exc,
6216 collstart-startp, collend-startp, &newpos);
6217 if (repunicode == NULL)
6218 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006219 if (PyBytes_Check(repunicode)) {
6220 /* Directly copy bytes result to output. */
6221 repsize = PyBytes_Size(repunicode);
6222 if (repsize > 1) {
6223 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006224 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006225 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6226 Py_DECREF(repunicode);
6227 goto onError;
6228 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006229 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006230 ressize += repsize-1;
6231 }
6232 memcpy(str, PyBytes_AsString(repunicode), repsize);
6233 str += repsize;
6234 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006235 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006236 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006237 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006238 /* need more space? (at least enough for what we
6239 have+the replacement+the rest of the string, so
6240 we won't have to check space for encodable characters) */
6241 respos = str - PyBytes_AS_STRING(res);
6242 repsize = PyUnicode_GET_SIZE(repunicode);
6243 requiredsize = respos+repsize+(endp-collend);
6244 if (requiredsize > ressize) {
6245 if (requiredsize<2*ressize)
6246 requiredsize = 2*ressize;
6247 if (_PyBytes_Resize(&res, requiredsize)) {
6248 Py_DECREF(repunicode);
6249 goto onError;
6250 }
6251 str = PyBytes_AS_STRING(res) + respos;
6252 ressize = requiredsize;
6253 }
6254 /* check if there is anything unencodable in the replacement
6255 and copy it to the output */
6256 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
6257 c = *uni2;
6258 if (c >= limit) {
6259 raise_encode_exception(&exc, encoding, startp, size,
6260 unicodepos, unicodepos+1, reason);
6261 Py_DECREF(repunicode);
6262 goto onError;
6263 }
6264 *str = (char)c;
6265 }
6266 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006267 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006268 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006269 }
6270 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006271 /* Resize if we allocated to much */
6272 size = str - PyBytes_AS_STRING(res);
6273 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006274 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006275 if (_PyBytes_Resize(&res, size) < 0)
6276 goto onError;
6277 }
6278
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006279 Py_XDECREF(errorHandler);
6280 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006281 return res;
6282
6283 onError:
6284 Py_XDECREF(res);
6285 Py_XDECREF(errorHandler);
6286 Py_XDECREF(exc);
6287 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006288}
6289
Alexander Belopolsky40018472011-02-26 01:02:56 +00006290PyObject *
6291PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006292 Py_ssize_t size,
6293 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006294{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006295 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006296}
6297
Alexander Belopolsky40018472011-02-26 01:02:56 +00006298PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006299_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006300{
6301 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006302 PyErr_BadArgument();
6303 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006304 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006305 if (PyUnicode_READY(unicode) == -1)
6306 return NULL;
6307 /* Fast path: if it is a one-byte string, construct
6308 bytes object directly. */
6309 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6310 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6311 PyUnicode_GET_LENGTH(unicode));
6312 /* Non-Latin-1 characters present. Defer to above function to
6313 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006314 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006315 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006316 errors);
6317}
6318
6319PyObject*
6320PyUnicode_AsLatin1String(PyObject *unicode)
6321{
6322 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006323}
6324
6325/* --- 7-bit ASCII Codec -------------------------------------------------- */
6326
Alexander Belopolsky40018472011-02-26 01:02:56 +00006327PyObject *
6328PyUnicode_DecodeASCII(const char *s,
6329 Py_ssize_t size,
6330 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006331{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006332 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006333 PyUnicodeObject *v;
6334 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006335 Py_ssize_t startinpos;
6336 Py_ssize_t endinpos;
6337 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006338 const char *e;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006339 unsigned char* d;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006340 PyObject *errorHandler = NULL;
6341 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006342 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00006343
Guido van Rossumd57fd912000-03-10 22:53:23 +00006344 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006345 if (size == 1 && *(unsigned char*)s < 128)
6346 return PyUnicode_FromOrdinal(*(unsigned char*)s);
6347
6348 /* Fast path. Assume the input actually *is* ASCII, and allocate
6349 a single-block Unicode object with that assumption. If there is
6350 an error, drop the object and start over. */
6351 v = (PyUnicodeObject*)PyUnicode_New(size, 127);
6352 if (v == NULL)
6353 goto onError;
6354 d = PyUnicode_1BYTE_DATA(v);
6355 for (i = 0; i < size; i++) {
6356 unsigned char ch = ((unsigned char*)s)[i];
6357 if (ch < 128)
6358 d[i] = ch;
6359 else
6360 break;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006361 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006362 if (i == size)
6363 return (PyObject*)v;
6364 Py_DECREF(v); /* start over */
Tim Petersced69f82003-09-16 20:30:58 +00006365
Guido van Rossumd57fd912000-03-10 22:53:23 +00006366 v = _PyUnicode_New(size);
6367 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006368 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006369 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006370 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006371 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006372 e = s + size;
6373 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006374 register unsigned char c = (unsigned char)*s;
6375 if (c < 128) {
6376 *p++ = c;
6377 ++s;
6378 }
6379 else {
6380 startinpos = s-starts;
6381 endinpos = startinpos + 1;
6382 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
6383 if (unicode_decode_call_errorhandler(
6384 errors, &errorHandler,
6385 "ascii", "ordinal not in range(128)",
6386 &starts, &e, &startinpos, &endinpos, &exc, &s,
6387 &v, &outpos, &p))
6388 goto onError;
6389 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006390 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00006391 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinnerfe226c02011-10-03 03:52:20 +02006392 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006393 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006394 Py_XDECREF(errorHandler);
6395 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006396 if (PyUnicode_READY(v) == -1) {
6397 Py_DECREF(v);
6398 return NULL;
6399 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006400 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006401
Benjamin Peterson29060642009-01-31 22:14:21 +00006402 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006403 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006404 Py_XDECREF(errorHandler);
6405 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006406 return NULL;
6407}
6408
Alexander Belopolsky40018472011-02-26 01:02:56 +00006409PyObject *
6410PyUnicode_EncodeASCII(const Py_UNICODE *p,
6411 Py_ssize_t size,
6412 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006413{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006414 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006415}
6416
Alexander Belopolsky40018472011-02-26 01:02:56 +00006417PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006418_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006419{
6420 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006421 PyErr_BadArgument();
6422 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006423 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006424 if (PyUnicode_READY(unicode) == -1)
6425 return NULL;
6426 /* Fast path: if it is an ASCII-only string, construct bytes object
6427 directly. Else defer to above function to raise the exception. */
6428 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6429 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6430 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006431 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006432 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006433 errors);
6434}
6435
6436PyObject *
6437PyUnicode_AsASCIIString(PyObject *unicode)
6438{
6439 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006440}
6441
Victor Stinner99b95382011-07-04 14:23:54 +02006442#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006443
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006444/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006445
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006446#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006447#define NEED_RETRY
6448#endif
6449
6450/* XXX This code is limited to "true" double-byte encodings, as
6451 a) it assumes an incomplete character consists of a single byte, and
6452 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00006453 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006454
/* Return 1 if the byte at s[offset] is reported as a DBCS lead byte and
   the position of the preceding character (per CharPrev) does not
   contradict that, 0 otherwise. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
6466
6467/*
6468 * Decode MBCS string into unicode object. If 'final' is set, converts
6469 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6470 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006471static int
6472decode_mbcs(PyUnicodeObject **v,
6473 const char *s, /* MBCS string */
6474 int size, /* sizeof MBCS string */
6475 int final,
6476 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006477{
6478 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00006479 Py_ssize_t n;
6480 DWORD usize;
6481 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006482
6483 assert(size >= 0);
6484
Victor Stinner554f3f02010-06-16 23:33:54 +00006485 /* check and handle 'errors' arg */
6486 if (errors==NULL || strcmp(errors, "strict")==0)
6487 flags = MB_ERR_INVALID_CHARS;
6488 else if (strcmp(errors, "ignore")==0)
6489 flags = 0;
6490 else {
6491 PyErr_Format(PyExc_ValueError,
6492 "mbcs encoding does not support errors='%s'",
6493 errors);
6494 return -1;
6495 }
6496
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006497 /* Skip trailing lead-byte unless 'final' is set */
6498 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006499 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006500
6501 /* First get the size of the result */
6502 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006503 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6504 if (usize==0)
6505 goto mbcs_decode_error;
6506 } else
6507 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006508
6509 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006510 /* Create unicode object */
6511 *v = _PyUnicode_New(usize);
6512 if (*v == NULL)
6513 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006514 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006515 }
6516 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006517 /* Extend unicode object */
6518 n = PyUnicode_GET_SIZE(*v);
Victor Stinnerfe226c02011-10-03 03:52:20 +02006519 if (PyUnicode_Resize(v, n + usize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006520 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006521 }
6522
6523 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00006524 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006525 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006526 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6527 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00006528 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006529 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006530 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00006531
6532mbcs_decode_error:
6533 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6534 we raise a UnicodeDecodeError - else it is a 'generic'
6535 windows error
6536 */
6537 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6538 /* Ideally, we should get reason from FormatMessage - this
6539 is the Windows 2000 English version of the message
6540 */
6541 PyObject *exc = NULL;
6542 const char *reason = "No mapping for the Unicode character exists "
6543 "in the target multi-byte code page.";
6544 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6545 if (exc != NULL) {
6546 PyCodec_StrictErrors(exc);
6547 Py_DECREF(exc);
6548 }
6549 } else {
6550 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6551 }
6552 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006553}
6554
Alexander Belopolsky40018472011-02-26 01:02:56 +00006555PyObject *
6556PyUnicode_DecodeMBCSStateful(const char *s,
6557 Py_ssize_t size,
6558 const char *errors,
6559 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006560{
6561 PyUnicodeObject *v = NULL;
6562 int done;
6563
6564 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006565 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006566
6567#ifdef NEED_RETRY
6568 retry:
6569 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006570 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006571 else
6572#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006573 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006574
6575 if (done < 0) {
6576 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006577 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006578 }
6579
6580 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006581 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006582
6583#ifdef NEED_RETRY
6584 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006585 s += done;
6586 size -= done;
6587 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006588 }
6589#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006590 if (PyUnicode_READY(v) == -1) {
6591 Py_DECREF(v);
6592 return NULL;
6593 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006594 return (PyObject *)v;
6595}
6596
Alexander Belopolsky40018472011-02-26 01:02:56 +00006597PyObject *
6598PyUnicode_DecodeMBCS(const char *s,
6599 Py_ssize_t size,
6600 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006601{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006602 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6603}
6604
6605/*
6606 * Convert unicode into string object (MBCS).
6607 * Returns 0 if succeed, -1 otherwise.
6608 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006609static int
6610encode_mbcs(PyObject **repr,
6611 const Py_UNICODE *p, /* unicode */
6612 int size, /* size of unicode */
6613 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006614{
Victor Stinner554f3f02010-06-16 23:33:54 +00006615 BOOL usedDefaultChar = FALSE;
6616 BOOL *pusedDefaultChar;
6617 int mbcssize;
6618 Py_ssize_t n;
6619 PyObject *exc = NULL;
6620 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006621
6622 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006623
Victor Stinner554f3f02010-06-16 23:33:54 +00006624 /* check and handle 'errors' arg */
6625 if (errors==NULL || strcmp(errors, "strict")==0) {
6626 flags = WC_NO_BEST_FIT_CHARS;
6627 pusedDefaultChar = &usedDefaultChar;
6628 } else if (strcmp(errors, "replace")==0) {
6629 flags = 0;
6630 pusedDefaultChar = NULL;
6631 } else {
6632 PyErr_Format(PyExc_ValueError,
6633 "mbcs encoding does not support errors='%s'",
6634 errors);
6635 return -1;
6636 }
6637
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006638 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006639 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006640 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
6641 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00006642 if (mbcssize == 0) {
6643 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6644 return -1;
6645 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006646 /* If we used a default char, then we failed! */
6647 if (pusedDefaultChar && *pusedDefaultChar)
6648 goto mbcs_encode_error;
6649 } else {
6650 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006651 }
6652
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006653 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006654 /* Create string object */
6655 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
6656 if (*repr == NULL)
6657 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006658 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006659 }
6660 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006661 /* Extend string object */
6662 n = PyBytes_Size(*repr);
6663 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
6664 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006665 }
6666
6667 /* Do the conversion */
6668 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006669 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006670 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
6671 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006672 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6673 return -1;
6674 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006675 if (pusedDefaultChar && *pusedDefaultChar)
6676 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006677 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006678 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006679
6680mbcs_encode_error:
6681 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
6682 Py_XDECREF(exc);
6683 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006684}
6685
Alexander Belopolsky40018472011-02-26 01:02:56 +00006686PyObject *
6687PyUnicode_EncodeMBCS(const Py_UNICODE *p,
6688 Py_ssize_t size,
6689 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006690{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006691 PyObject *repr = NULL;
6692 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00006693
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006694#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00006695 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006696 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006697 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006698 else
6699#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006700 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006701
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006702 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006703 Py_XDECREF(repr);
6704 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006705 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006706
6707#ifdef NEED_RETRY
6708 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006709 p += INT_MAX;
6710 size -= INT_MAX;
6711 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006712 }
6713#endif
6714
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006715 return repr;
6716}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006717
Alexander Belopolsky40018472011-02-26 01:02:56 +00006718PyObject *
6719PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006720{
6721 if (!PyUnicode_Check(unicode)) {
6722 PyErr_BadArgument();
6723 return NULL;
6724 }
6725 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006726 PyUnicode_GET_SIZE(unicode),
6727 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006728}
6729
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006730#undef NEED_RETRY
6731
Victor Stinner99b95382011-07-04 14:23:54 +02006732#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006733
Guido van Rossumd57fd912000-03-10 22:53:23 +00006734/* --- Character Mapping Codec -------------------------------------------- */
6735
Alexander Belopolsky40018472011-02-26 01:02:56 +00006736PyObject *
6737PyUnicode_DecodeCharmap(const char *s,
6738 Py_ssize_t size,
6739 PyObject *mapping,
6740 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006741{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006742 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006743 Py_ssize_t startinpos;
6744 Py_ssize_t endinpos;
6745 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006746 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006747 PyUnicodeObject *v;
6748 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006749 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006750 PyObject *errorHandler = NULL;
6751 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006752 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006753 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006754
Guido van Rossumd57fd912000-03-10 22:53:23 +00006755 /* Default to Latin-1 */
6756 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006757 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006758
6759 v = _PyUnicode_New(size);
6760 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006761 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006762 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006763 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006764 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006765 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006766 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006767 mapstring = PyUnicode_AS_UNICODE(mapping);
6768 maplen = PyUnicode_GET_SIZE(mapping);
6769 while (s < e) {
6770 unsigned char ch = *s;
6771 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006772
Benjamin Peterson29060642009-01-31 22:14:21 +00006773 if (ch < maplen)
6774 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006775
Benjamin Peterson29060642009-01-31 22:14:21 +00006776 if (x == 0xfffe) {
6777 /* undefined mapping */
6778 outpos = p-PyUnicode_AS_UNICODE(v);
6779 startinpos = s-starts;
6780 endinpos = startinpos+1;
6781 if (unicode_decode_call_errorhandler(
6782 errors, &errorHandler,
6783 "charmap", "character maps to <undefined>",
6784 &starts, &e, &startinpos, &endinpos, &exc, &s,
6785 &v, &outpos, &p)) {
6786 goto onError;
6787 }
6788 continue;
6789 }
6790 *p++ = x;
6791 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006792 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006793 }
6794 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006795 while (s < e) {
6796 unsigned char ch = *s;
6797 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006798
Benjamin Peterson29060642009-01-31 22:14:21 +00006799 /* Get mapping (char ordinal -> integer, Unicode char or None) */
6800 w = PyLong_FromLong((long)ch);
6801 if (w == NULL)
6802 goto onError;
6803 x = PyObject_GetItem(mapping, w);
6804 Py_DECREF(w);
6805 if (x == NULL) {
6806 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6807 /* No mapping found means: mapping is undefined. */
6808 PyErr_Clear();
6809 x = Py_None;
6810 Py_INCREF(x);
6811 } else
6812 goto onError;
6813 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006814
Benjamin Peterson29060642009-01-31 22:14:21 +00006815 /* Apply mapping */
6816 if (PyLong_Check(x)) {
6817 long value = PyLong_AS_LONG(x);
6818 if (value < 0 || value > 65535) {
6819 PyErr_SetString(PyExc_TypeError,
6820 "character mapping must be in range(65536)");
6821 Py_DECREF(x);
6822 goto onError;
6823 }
6824 *p++ = (Py_UNICODE)value;
6825 }
6826 else if (x == Py_None) {
6827 /* undefined mapping */
6828 outpos = p-PyUnicode_AS_UNICODE(v);
6829 startinpos = s-starts;
6830 endinpos = startinpos+1;
6831 if (unicode_decode_call_errorhandler(
6832 errors, &errorHandler,
6833 "charmap", "character maps to <undefined>",
6834 &starts, &e, &startinpos, &endinpos, &exc, &s,
6835 &v, &outpos, &p)) {
6836 Py_DECREF(x);
6837 goto onError;
6838 }
6839 Py_DECREF(x);
6840 continue;
6841 }
6842 else if (PyUnicode_Check(x)) {
6843 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006844
Benjamin Peterson29060642009-01-31 22:14:21 +00006845 if (targetsize == 1)
6846 /* 1-1 mapping */
6847 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006848
Benjamin Peterson29060642009-01-31 22:14:21 +00006849 else if (targetsize > 1) {
6850 /* 1-n mapping */
6851 if (targetsize > extrachars) {
6852 /* resize first */
6853 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
6854 Py_ssize_t needed = (targetsize - extrachars) + \
6855 (targetsize << 2);
6856 extrachars += needed;
6857 /* XXX overflow detection missing */
Victor Stinnerfe226c02011-10-03 03:52:20 +02006858 if (PyUnicode_Resize((PyObject**)&v,
Benjamin Peterson29060642009-01-31 22:14:21 +00006859 PyUnicode_GET_SIZE(v) + needed) < 0) {
6860 Py_DECREF(x);
6861 goto onError;
6862 }
6863 p = PyUnicode_AS_UNICODE(v) + oldpos;
6864 }
6865 Py_UNICODE_COPY(p,
6866 PyUnicode_AS_UNICODE(x),
6867 targetsize);
6868 p += targetsize;
6869 extrachars -= targetsize;
6870 }
6871 /* 1-0 mapping: skip the character */
6872 }
6873 else {
6874 /* wrong return value */
6875 PyErr_SetString(PyExc_TypeError,
6876 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006877 Py_DECREF(x);
6878 goto onError;
6879 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006880 Py_DECREF(x);
6881 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006882 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006883 }
6884 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinnerfe226c02011-10-03 03:52:20 +02006885 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006886 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006887 Py_XDECREF(errorHandler);
6888 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006889 if (PyUnicode_READY(v) == -1) {
6890 Py_DECREF(v);
6891 return NULL;
6892 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006893 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006894
Benjamin Peterson29060642009-01-31 22:14:21 +00006895 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006896 Py_XDECREF(errorHandler);
6897 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006898 Py_XDECREF(v);
6899 return NULL;
6900}
6901
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006902/* Charmap encoding: the lookup table */
6903
Alexander Belopolsky40018472011-02-26 01:02:56 +00006904struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00006905 PyObject_HEAD
6906 unsigned char level1[32];
6907 int count2, count3;
6908 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006909};
6910
6911static PyObject*
6912encoding_map_size(PyObject *obj, PyObject* args)
6913{
6914 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006915 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00006916 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006917}
6918
6919static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006920 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00006921 PyDoc_STR("Return the size (in bytes) of this object") },
6922 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006923};
6924
6925static void
6926encoding_map_dealloc(PyObject* o)
6927{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006928 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006929}
6930
6931static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006932 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006933 "EncodingMap", /*tp_name*/
6934 sizeof(struct encoding_map), /*tp_basicsize*/
6935 0, /*tp_itemsize*/
6936 /* methods */
6937 encoding_map_dealloc, /*tp_dealloc*/
6938 0, /*tp_print*/
6939 0, /*tp_getattr*/
6940 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00006941 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00006942 0, /*tp_repr*/
6943 0, /*tp_as_number*/
6944 0, /*tp_as_sequence*/
6945 0, /*tp_as_mapping*/
6946 0, /*tp_hash*/
6947 0, /*tp_call*/
6948 0, /*tp_str*/
6949 0, /*tp_getattro*/
6950 0, /*tp_setattro*/
6951 0, /*tp_as_buffer*/
6952 Py_TPFLAGS_DEFAULT, /*tp_flags*/
6953 0, /*tp_doc*/
6954 0, /*tp_traverse*/
6955 0, /*tp_clear*/
6956 0, /*tp_richcompare*/
6957 0, /*tp_weaklistoffset*/
6958 0, /*tp_iter*/
6959 0, /*tp_iternext*/
6960 encoding_map_methods, /*tp_methods*/
6961 0, /*tp_members*/
6962 0, /*tp_getset*/
6963 0, /*tp_base*/
6964 0, /*tp_dict*/
6965 0, /*tp_descr_get*/
6966 0, /*tp_descr_set*/
6967 0, /*tp_dictoffset*/
6968 0, /*tp_init*/
6969 0, /*tp_alloc*/
6970 0, /*tp_new*/
6971 0, /*tp_free*/
6972 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006973};
6974
6975PyObject*
6976PyUnicode_BuildEncodingMap(PyObject* string)
6977{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006978 PyObject *result;
6979 struct encoding_map *mresult;
6980 int i;
6981 int need_dict = 0;
6982 unsigned char level1[32];
6983 unsigned char level2[512];
6984 unsigned char *mlevel1, *mlevel2, *mlevel3;
6985 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006986 int kind;
6987 void *data;
6988 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006989
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006990 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006991 PyErr_BadArgument();
6992 return NULL;
6993 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006994 kind = PyUnicode_KIND(string);
6995 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006996 memset(level1, 0xFF, sizeof level1);
6997 memset(level2, 0xFF, sizeof level2);
6998
6999 /* If there isn't a one-to-one mapping of NULL to \0,
7000 or if there are non-BMP characters, we need to use
7001 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007002 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007003 need_dict = 1;
7004 for (i = 1; i < 256; i++) {
7005 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007006 ch = PyUnicode_READ(kind, data, i);
7007 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007008 need_dict = 1;
7009 break;
7010 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007011 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007012 /* unmapped character */
7013 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007014 l1 = ch >> 11;
7015 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007016 if (level1[l1] == 0xFF)
7017 level1[l1] = count2++;
7018 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007019 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007020 }
7021
7022 if (count2 >= 0xFF || count3 >= 0xFF)
7023 need_dict = 1;
7024
7025 if (need_dict) {
7026 PyObject *result = PyDict_New();
7027 PyObject *key, *value;
7028 if (!result)
7029 return NULL;
7030 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007031 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007032 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007033 if (!key || !value)
7034 goto failed1;
7035 if (PyDict_SetItem(result, key, value) == -1)
7036 goto failed1;
7037 Py_DECREF(key);
7038 Py_DECREF(value);
7039 }
7040 return result;
7041 failed1:
7042 Py_XDECREF(key);
7043 Py_XDECREF(value);
7044 Py_DECREF(result);
7045 return NULL;
7046 }
7047
7048 /* Create a three-level trie */
7049 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7050 16*count2 + 128*count3 - 1);
7051 if (!result)
7052 return PyErr_NoMemory();
7053 PyObject_Init(result, &EncodingMapType);
7054 mresult = (struct encoding_map*)result;
7055 mresult->count2 = count2;
7056 mresult->count3 = count3;
7057 mlevel1 = mresult->level1;
7058 mlevel2 = mresult->level23;
7059 mlevel3 = mresult->level23 + 16*count2;
7060 memcpy(mlevel1, level1, 32);
7061 memset(mlevel2, 0xFF, 16*count2);
7062 memset(mlevel3, 0, 128*count3);
7063 count3 = 0;
7064 for (i = 1; i < 256; i++) {
7065 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007066 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007067 /* unmapped character */
7068 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007069 o1 = PyUnicode_READ(kind, data, i)>>11;
7070 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007071 i2 = 16*mlevel1[o1] + o2;
7072 if (mlevel2[i2] == 0xFF)
7073 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007074 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007075 i3 = 128*mlevel2[i2] + o3;
7076 mlevel3[i3] = i;
7077 }
7078 return result;
7079}
7080
7081static int
7082encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
7083{
7084 struct encoding_map *map = (struct encoding_map*)mapping;
7085 int l1 = c>>11;
7086 int l2 = (c>>7) & 0xF;
7087 int l3 = c & 0x7F;
7088 int i;
7089
7090#ifdef Py_UNICODE_WIDE
7091 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007092 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007093 }
7094#endif
7095 if (c == 0)
7096 return 0;
7097 /* level 1*/
7098 i = map->level1[l1];
7099 if (i == 0xFF) {
7100 return -1;
7101 }
7102 /* level 2*/
7103 i = map->level23[16*i+l2];
7104 if (i == 0xFF) {
7105 return -1;
7106 }
7107 /* level 3 */
7108 i = map->level23[16*map->count2 + 128*i + l3];
7109 if (i == 0) {
7110 return -1;
7111 }
7112 return i;
7113}
7114
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007115/* Lookup the character ch in the mapping. If the character
7116 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007117 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007118static PyObject *
7119charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007120{
Christian Heimes217cfd12007-12-02 14:31:20 +00007121 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007122 PyObject *x;
7123
7124 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007125 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007126 x = PyObject_GetItem(mapping, w);
7127 Py_DECREF(w);
7128 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007129 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7130 /* No mapping found means: mapping is undefined. */
7131 PyErr_Clear();
7132 x = Py_None;
7133 Py_INCREF(x);
7134 return x;
7135 } else
7136 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007137 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007138 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007139 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007140 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007141 long value = PyLong_AS_LONG(x);
7142 if (value < 0 || value > 255) {
7143 PyErr_SetString(PyExc_TypeError,
7144 "character mapping must be in range(256)");
7145 Py_DECREF(x);
7146 return NULL;
7147 }
7148 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007149 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007150 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007151 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007152 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007153 /* wrong return value */
7154 PyErr_Format(PyExc_TypeError,
7155 "character mapping must return integer, bytes or None, not %.400s",
7156 x->ob_type->tp_name);
7157 Py_DECREF(x);
7158 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007159 }
7160}
7161
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007162static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007163charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007164{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007165 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7166 /* exponentially overallocate to minimize reallocations */
7167 if (requiredsize < 2*outsize)
7168 requiredsize = 2*outsize;
7169 if (_PyBytes_Resize(outobj, requiredsize))
7170 return -1;
7171 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007172}
7173
/* Outcome of trying to encode one character through a charmap */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the character was encoded */
    enc_FAILED,         /* the mapping has no entry for the character */
    enc_EXCEPTION       /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007177/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007178 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007179 space is available. Return a new reference to the object that
7180 was put in the output buffer, or Py_None, if the mapping was undefined
7181 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007182 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007183static charmapencode_result
7184charmapencode_output(Py_UNICODE c, PyObject *mapping,
7185 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007186{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007187 PyObject *rep;
7188 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007189 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007190
Christian Heimes90aa7642007-12-19 02:45:37 +00007191 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007192 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007193 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007194 if (res == -1)
7195 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007196 if (outsize<requiredsize)
7197 if (charmapencode_resize(outobj, outpos, requiredsize))
7198 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007199 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007200 outstart[(*outpos)++] = (char)res;
7201 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007202 }
7203
7204 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007205 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007206 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007207 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007208 Py_DECREF(rep);
7209 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007210 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007211 if (PyLong_Check(rep)) {
7212 Py_ssize_t requiredsize = *outpos+1;
7213 if (outsize<requiredsize)
7214 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7215 Py_DECREF(rep);
7216 return enc_EXCEPTION;
7217 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007218 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007219 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007220 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007221 else {
7222 const char *repchars = PyBytes_AS_STRING(rep);
7223 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7224 Py_ssize_t requiredsize = *outpos+repsize;
7225 if (outsize<requiredsize)
7226 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7227 Py_DECREF(rep);
7228 return enc_EXCEPTION;
7229 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007230 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007231 memcpy(outstart + *outpos, repchars, repsize);
7232 *outpos += repsize;
7233 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007234 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007235 Py_DECREF(rep);
7236 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007237}
7238
7239/* handle an error in PyUnicode_EncodeCharmap
7240 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007241static int
7242charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00007243 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007244 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007245 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007246 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007247{
7248 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007249 Py_ssize_t repsize;
7250 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007251 Py_UNICODE *uni2;
7252 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007253 Py_ssize_t collstartpos = *inpos;
7254 Py_ssize_t collendpos = *inpos+1;
7255 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007256 char *encoding = "charmap";
7257 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007258 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007259
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007260 /* find all unencodable characters */
7261 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007262 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007263 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007264 int res = encoding_map_lookup(p[collendpos], mapping);
7265 if (res != -1)
7266 break;
7267 ++collendpos;
7268 continue;
7269 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007270
Benjamin Peterson29060642009-01-31 22:14:21 +00007271 rep = charmapencode_lookup(p[collendpos], mapping);
7272 if (rep==NULL)
7273 return -1;
7274 else if (rep!=Py_None) {
7275 Py_DECREF(rep);
7276 break;
7277 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007278 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007279 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007280 }
7281 /* cache callback name lookup
7282 * (if not done yet, i.e. it's the first error) */
7283 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007284 if ((errors==NULL) || (!strcmp(errors, "strict")))
7285 *known_errorHandler = 1;
7286 else if (!strcmp(errors, "replace"))
7287 *known_errorHandler = 2;
7288 else if (!strcmp(errors, "ignore"))
7289 *known_errorHandler = 3;
7290 else if (!strcmp(errors, "xmlcharrefreplace"))
7291 *known_errorHandler = 4;
7292 else
7293 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007294 }
7295 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007296 case 1: /* strict */
7297 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7298 return -1;
7299 case 2: /* replace */
7300 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007301 x = charmapencode_output('?', mapping, res, respos);
7302 if (x==enc_EXCEPTION) {
7303 return -1;
7304 }
7305 else if (x==enc_FAILED) {
7306 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7307 return -1;
7308 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007309 }
7310 /* fall through */
7311 case 3: /* ignore */
7312 *inpos = collendpos;
7313 break;
7314 case 4: /* xmlcharrefreplace */
7315 /* generate replacement (temporarily (mis)uses p) */
7316 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007317 char buffer[2+29+1+1];
7318 char *cp;
7319 sprintf(buffer, "&#%d;", (int)p[collpos]);
7320 for (cp = buffer; *cp; ++cp) {
7321 x = charmapencode_output(*cp, mapping, res, respos);
7322 if (x==enc_EXCEPTION)
7323 return -1;
7324 else if (x==enc_FAILED) {
7325 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7326 return -1;
7327 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007328 }
7329 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007330 *inpos = collendpos;
7331 break;
7332 default:
7333 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00007334 encoding, reason, p, size, exceptionObject,
7335 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007336 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007337 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007338 if (PyBytes_Check(repunicode)) {
7339 /* Directly copy bytes result to output. */
7340 Py_ssize_t outsize = PyBytes_Size(*res);
7341 Py_ssize_t requiredsize;
7342 repsize = PyBytes_Size(repunicode);
7343 requiredsize = *respos + repsize;
7344 if (requiredsize > outsize)
7345 /* Make room for all additional bytes. */
7346 if (charmapencode_resize(res, respos, requiredsize)) {
7347 Py_DECREF(repunicode);
7348 return -1;
7349 }
7350 memcpy(PyBytes_AsString(*res) + *respos,
7351 PyBytes_AsString(repunicode), repsize);
7352 *respos += repsize;
7353 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007354 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007355 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007356 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007357 /* generate replacement */
7358 repsize = PyUnicode_GET_SIZE(repunicode);
7359 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007360 x = charmapencode_output(*uni2, mapping, res, respos);
7361 if (x==enc_EXCEPTION) {
7362 return -1;
7363 }
7364 else if (x==enc_FAILED) {
7365 Py_DECREF(repunicode);
7366 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7367 return -1;
7368 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007369 }
7370 *inpos = newpos;
7371 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007372 }
7373 return 0;
7374}
7375
Alexander Belopolsky40018472011-02-26 01:02:56 +00007376PyObject *
7377PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7378 Py_ssize_t size,
7379 PyObject *mapping,
7380 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007381{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007382 /* output object */
7383 PyObject *res = NULL;
7384 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007385 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007386 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007387 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007388 PyObject *errorHandler = NULL;
7389 PyObject *exc = NULL;
7390 /* the following variable is used for caching string comparisons
7391 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7392 * 3=ignore, 4=xmlcharrefreplace */
7393 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007394
7395 /* Default to Latin-1 */
7396 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007397 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007398
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007399 /* allocate enough for a simple encoding without
7400 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007401 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007402 if (res == NULL)
7403 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007404 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007405 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007406
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007407 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007408 /* try to encode it */
7409 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7410 if (x==enc_EXCEPTION) /* error */
7411 goto onError;
7412 if (x==enc_FAILED) { /* unencodable character */
7413 if (charmap_encoding_error(p, size, &inpos, mapping,
7414 &exc,
7415 &known_errorHandler, &errorHandler, errors,
7416 &res, &respos)) {
7417 goto onError;
7418 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007419 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007420 else
7421 /* done with this character => adjust input position */
7422 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007423 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007424
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007425 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007426 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007427 if (_PyBytes_Resize(&res, respos) < 0)
7428 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00007429
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007430 Py_XDECREF(exc);
7431 Py_XDECREF(errorHandler);
7432 return res;
7433
Benjamin Peterson29060642009-01-31 22:14:21 +00007434 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007435 Py_XDECREF(res);
7436 Py_XDECREF(exc);
7437 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007438 return NULL;
7439}
7440
Alexander Belopolsky40018472011-02-26 01:02:56 +00007441PyObject *
7442PyUnicode_AsCharmapString(PyObject *unicode,
7443 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007444{
7445 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007446 PyErr_BadArgument();
7447 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007448 }
7449 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007450 PyUnicode_GET_SIZE(unicode),
7451 mapping,
7452 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007453}
7454
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007455/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007456static void
7457make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007458 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007459 Py_ssize_t startpos, Py_ssize_t endpos,
7460 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007461{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007462 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007463 *exceptionObject = _PyUnicodeTranslateError_Create(
7464 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007465 }
7466 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007467 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7468 goto onError;
7469 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7470 goto onError;
7471 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7472 goto onError;
7473 return;
7474 onError:
7475 Py_DECREF(*exceptionObject);
7476 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007477 }
7478}
7479
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007480/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007481static void
7482raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007483 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007484 Py_ssize_t startpos, Py_ssize_t endpos,
7485 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007486{
7487 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007488 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007489 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007490 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007491}
7492
7493/* error handling callback helper:
7494 build arguments, call the callback and check the arguments,
7495 put the result into newpos and return the replacement string, which
7496 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007497static PyObject *
7498unicode_translate_call_errorhandler(const char *errors,
7499 PyObject **errorHandler,
7500 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007501 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007502 Py_ssize_t startpos, Py_ssize_t endpos,
7503 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007504{
Benjamin Peterson142957c2008-07-04 19:55:29 +00007505 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007506
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007507 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007508 PyObject *restuple;
7509 PyObject *resunicode;
7510
7511 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007512 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007513 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007514 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007515 }
7516
7517 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007518 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007519 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007520 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007521
7522 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00007523 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007524 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007525 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007526 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00007527 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007528 Py_DECREF(restuple);
7529 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007530 }
7531 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00007532 &resunicode, &i_newpos)) {
7533 Py_DECREF(restuple);
7534 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007535 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007536 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007537 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007538 else
7539 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007540 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007541 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7542 Py_DECREF(restuple);
7543 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007544 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007545 Py_INCREF(resunicode);
7546 Py_DECREF(restuple);
7547 return resunicode;
7548}
7549
7550/* Lookup the character ch in the mapping and put the result in result,
7551 which must be decrefed by the caller.
7552 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007553static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007554charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007555{
Christian Heimes217cfd12007-12-02 14:31:20 +00007556 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007557 PyObject *x;
7558
7559 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007560 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007561 x = PyObject_GetItem(mapping, w);
7562 Py_DECREF(w);
7563 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007564 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7565 /* No mapping found means: use 1:1 mapping. */
7566 PyErr_Clear();
7567 *result = NULL;
7568 return 0;
7569 } else
7570 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007571 }
7572 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007573 *result = x;
7574 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007575 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007576 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007577 long value = PyLong_AS_LONG(x);
7578 long max = PyUnicode_GetMax();
7579 if (value < 0 || value > max) {
7580 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00007581 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007582 Py_DECREF(x);
7583 return -1;
7584 }
7585 *result = x;
7586 return 0;
7587 }
7588 else if (PyUnicode_Check(x)) {
7589 *result = x;
7590 return 0;
7591 }
7592 else {
7593 /* wrong return value */
7594 PyErr_SetString(PyExc_TypeError,
7595 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007596 Py_DECREF(x);
7597 return -1;
7598 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007599}
7600/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00007601 if not reallocate and adjust various state variables.
7602 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007603static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007604charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00007605 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007606{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007607 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00007608 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007609 /* exponentially overallocate to minimize reallocations */
7610 if (requiredsize < 2 * oldsize)
7611 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007612 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
7613 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007614 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007615 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007616 }
7617 return 0;
7618}
7619/* lookup the character, put the result in the output string and adjust
7620 various state variables. Return a new reference to the object that
7621 was put in the output buffer in *result, or Py_None, if the mapping was
7622 undefined (in which case no character was written).
7623 The called must decref result.
7624 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007625static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007626charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
7627 PyObject *mapping, Py_UCS4 **output,
7628 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007629 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007630{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007631 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
7632 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00007633 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007634 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007635 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007636 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007637 }
7638 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007639 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00007640 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007641 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007642 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007643 }
7644 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007645 Py_ssize_t repsize;
7646 if (PyUnicode_READY(*res) == -1)
7647 return -1;
7648 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00007649 if (repsize==1) {
7650 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007651 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00007652 }
7653 else if (repsize!=0) {
7654 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007655 Py_ssize_t requiredsize = *opos +
7656 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00007657 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007658 Py_ssize_t i;
7659 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007660 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007661 for(i = 0; i < repsize; i++)
7662 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00007663 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007664 }
7665 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007666 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007667 return 0;
7668}
7669
Alexander Belopolsky40018472011-02-26 01:02:56 +00007670PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007671_PyUnicode_TranslateCharmap(PyObject *input,
7672 PyObject *mapping,
7673 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007674{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007675 /* input object */
7676 char *idata;
7677 Py_ssize_t size, i;
7678 int kind;
7679 /* output buffer */
7680 Py_UCS4 *output = NULL;
7681 Py_ssize_t osize;
7682 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007683 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007684 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007685 char *reason = "character maps to <undefined>";
7686 PyObject *errorHandler = NULL;
7687 PyObject *exc = NULL;
7688 /* the following variable is used for caching string comparisons
7689 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7690 * 3=ignore, 4=xmlcharrefreplace */
7691 int known_errorHandler = -1;
7692
Guido van Rossumd57fd912000-03-10 22:53:23 +00007693 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007694 PyErr_BadArgument();
7695 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007696 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007697
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007698 if (PyUnicode_READY(input) == -1)
7699 return NULL;
7700 idata = (char*)PyUnicode_DATA(input);
7701 kind = PyUnicode_KIND(input);
7702 size = PyUnicode_GET_LENGTH(input);
7703 i = 0;
7704
7705 if (size == 0) {
7706 Py_INCREF(input);
7707 return input;
7708 }
7709
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007710 /* allocate enough for a simple 1:1 translation without
7711 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007712 osize = size;
7713 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
7714 opos = 0;
7715 if (output == NULL) {
7716 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00007717 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007718 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007719
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007720 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007721 /* try to encode it */
7722 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007723 if (charmaptranslate_output(input, i, mapping,
7724 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007725 Py_XDECREF(x);
7726 goto onError;
7727 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007728 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00007729 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007730 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007731 else { /* untranslatable character */
7732 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7733 Py_ssize_t repsize;
7734 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007735 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00007736 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007737 Py_ssize_t collstart = i;
7738 Py_ssize_t collend = i+1;
7739 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007740
Benjamin Peterson29060642009-01-31 22:14:21 +00007741 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007742 while (collend < size) {
7743 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007744 goto onError;
7745 Py_XDECREF(x);
7746 if (x!=Py_None)
7747 break;
7748 ++collend;
7749 }
7750 /* cache callback name lookup
7751 * (if not done yet, i.e. it's the first error) */
7752 if (known_errorHandler==-1) {
7753 if ((errors==NULL) || (!strcmp(errors, "strict")))
7754 known_errorHandler = 1;
7755 else if (!strcmp(errors, "replace"))
7756 known_errorHandler = 2;
7757 else if (!strcmp(errors, "ignore"))
7758 known_errorHandler = 3;
7759 else if (!strcmp(errors, "xmlcharrefreplace"))
7760 known_errorHandler = 4;
7761 else
7762 known_errorHandler = 0;
7763 }
7764 switch (known_errorHandler) {
7765 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007766 raise_translate_exception(&exc, input, collstart,
7767 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007768 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007769 case 2: /* replace */
7770 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007771 for (coll = collstart; coll<collend; coll++)
7772 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00007773 /* fall through */
7774 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007775 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007776 break;
7777 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007778 /* generate replacement (temporarily (mis)uses i) */
7779 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007780 char buffer[2+29+1+1];
7781 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007782 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
7783 if (charmaptranslate_makespace(&output, &osize,
7784 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00007785 goto onError;
7786 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007787 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00007788 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007789 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007790 break;
7791 default:
7792 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007793 reason, input, &exc,
7794 collstart, collend, &newpos);
7795 if (repunicode == NULL || PyUnicode_READY(repunicode) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007796 goto onError;
7797 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007798 repsize = PyUnicode_GET_LENGTH(repunicode);
7799 if (charmaptranslate_makespace(&output, &osize,
7800 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007801 Py_DECREF(repunicode);
7802 goto onError;
7803 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007804 for (uni2 = 0; repsize-->0; ++uni2)
7805 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
7806 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00007807 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007808 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007809 }
7810 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007811 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
7812 if (!res)
7813 goto onError;
7814 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007815 Py_XDECREF(exc);
7816 Py_XDECREF(errorHandler);
7817 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007818
Benjamin Peterson29060642009-01-31 22:14:21 +00007819 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007820 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007821 Py_XDECREF(exc);
7822 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007823 return NULL;
7824}
7825
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007826/* Deprecated. Use PyUnicode_Translate instead. */
7827PyObject *
7828PyUnicode_TranslateCharmap(const Py_UNICODE *p,
7829 Py_ssize_t size,
7830 PyObject *mapping,
7831 const char *errors)
7832{
7833 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7834 if (!unicode)
7835 return NULL;
7836 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
7837}
7838
Alexander Belopolsky40018472011-02-26 01:02:56 +00007839PyObject *
7840PyUnicode_Translate(PyObject *str,
7841 PyObject *mapping,
7842 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007843{
7844 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007845
Guido van Rossumd57fd912000-03-10 22:53:23 +00007846 str = PyUnicode_FromObject(str);
7847 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007848 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007849 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007850 Py_DECREF(str);
7851 return result;
Tim Petersced69f82003-09-16 20:30:58 +00007852
Benjamin Peterson29060642009-01-31 22:14:21 +00007853 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007854 Py_XDECREF(str);
7855 return NULL;
7856}
Tim Petersced69f82003-09-16 20:30:58 +00007857
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007858static Py_UCS4
7859fix_decimal_and_space_to_ascii(PyUnicodeObject *self)
7860{
7861 /* No need to call PyUnicode_READY(self) because this function is only
7862 called as a callback from fixup() which does it already. */
7863 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
7864 const int kind = PyUnicode_KIND(self);
7865 void *data = PyUnicode_DATA(self);
7866 Py_UCS4 maxchar = 0, ch, fixed;
7867 Py_ssize_t i;
7868
7869 for (i = 0; i < len; ++i) {
7870 ch = PyUnicode_READ(kind, data, i);
7871 fixed = 0;
7872 if (ch > 127) {
7873 if (Py_UNICODE_ISSPACE(ch))
7874 fixed = ' ';
7875 else {
7876 const int decimal = Py_UNICODE_TODECIMAL(ch);
7877 if (decimal >= 0)
7878 fixed = '0' + decimal;
7879 }
7880 if (fixed != 0) {
7881 if (fixed > maxchar)
7882 maxchar = fixed;
7883 PyUnicode_WRITE(kind, data, i, fixed);
7884 }
7885 else if (ch > maxchar)
7886 maxchar = ch;
7887 }
7888 else if (ch > maxchar)
7889 maxchar = ch;
7890 }
7891
7892 return maxchar;
7893}
7894
7895PyObject *
7896_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
7897{
7898 if (!PyUnicode_Check(unicode)) {
7899 PyErr_BadInternalCall();
7900 return NULL;
7901 }
7902 if (PyUnicode_READY(unicode) == -1)
7903 return NULL;
7904 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
7905 /* If the string is already ASCII, just return the same string */
7906 Py_INCREF(unicode);
7907 return unicode;
7908 }
7909 return fixup((PyUnicodeObject *)unicode, fix_decimal_and_space_to_ascii);
7910}
7911
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007912PyObject *
7913PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
7914 Py_ssize_t length)
7915{
7916 PyObject *result;
7917 Py_UNICODE *p; /* write pointer into result */
7918 Py_ssize_t i;
7919 /* Copy to a new string */
7920 result = (PyObject *)_PyUnicode_New(length);
7921 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
7922 if (result == NULL)
7923 return result;
7924 p = PyUnicode_AS_UNICODE(result);
7925 /* Iterate over code points */
7926 for (i = 0; i < length; i++) {
7927 Py_UNICODE ch =s[i];
7928 if (ch > 127) {
7929 int decimal = Py_UNICODE_TODECIMAL(ch);
7930 if (decimal >= 0)
7931 p[i] = '0' + decimal;
7932 }
7933 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007934 if (PyUnicode_READY((PyUnicodeObject*)result) == -1) {
7935 Py_DECREF(result);
7936 return NULL;
7937 }
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007938 return result;
7939}
Guido van Rossum9e896b32000-04-05 20:11:21 +00007940/* --- Decimal Encoder ---------------------------------------------------- */
7941
Alexander Belopolsky40018472011-02-26 01:02:56 +00007942int
7943PyUnicode_EncodeDecimal(Py_UNICODE *s,
7944 Py_ssize_t length,
7945 char *output,
7946 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00007947{
7948 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007949 PyObject *errorHandler = NULL;
7950 PyObject *exc = NULL;
7951 const char *encoding = "decimal";
7952 const char *reason = "invalid decimal Unicode string";
7953 /* the following variable is used for caching string comparisons
7954 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
7955 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00007956
7957 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007958 PyErr_BadArgument();
7959 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00007960 }
7961
7962 p = s;
7963 end = s + length;
7964 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007965 register Py_UNICODE ch = *p;
7966 int decimal;
7967 PyObject *repunicode;
7968 Py_ssize_t repsize;
7969 Py_ssize_t newpos;
7970 Py_UNICODE *uni2;
7971 Py_UNICODE *collstart;
7972 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00007973
Benjamin Peterson29060642009-01-31 22:14:21 +00007974 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007975 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00007976 ++p;
7977 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007978 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007979 decimal = Py_UNICODE_TODECIMAL(ch);
7980 if (decimal >= 0) {
7981 *output++ = '0' + decimal;
7982 ++p;
7983 continue;
7984 }
7985 if (0 < ch && ch < 256) {
7986 *output++ = (char)ch;
7987 ++p;
7988 continue;
7989 }
7990 /* All other characters are considered unencodable */
7991 collstart = p;
7992 collend = p+1;
7993 while (collend < end) {
7994 if ((0 < *collend && *collend < 256) ||
7995 !Py_UNICODE_ISSPACE(*collend) ||
7996 Py_UNICODE_TODECIMAL(*collend))
7997 break;
7998 }
7999 /* cache callback name lookup
8000 * (if not done yet, i.e. it's the first error) */
8001 if (known_errorHandler==-1) {
8002 if ((errors==NULL) || (!strcmp(errors, "strict")))
8003 known_errorHandler = 1;
8004 else if (!strcmp(errors, "replace"))
8005 known_errorHandler = 2;
8006 else if (!strcmp(errors, "ignore"))
8007 known_errorHandler = 3;
8008 else if (!strcmp(errors, "xmlcharrefreplace"))
8009 known_errorHandler = 4;
8010 else
8011 known_errorHandler = 0;
8012 }
8013 switch (known_errorHandler) {
8014 case 1: /* strict */
8015 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
8016 goto onError;
8017 case 2: /* replace */
8018 for (p = collstart; p < collend; ++p)
8019 *output++ = '?';
8020 /* fall through */
8021 case 3: /* ignore */
8022 p = collend;
8023 break;
8024 case 4: /* xmlcharrefreplace */
8025 /* generate replacement (temporarily (mis)uses p) */
8026 for (p = collstart; p < collend; ++p)
8027 output += sprintf(output, "&#%d;", (int)*p);
8028 p = collend;
8029 break;
8030 default:
8031 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
8032 encoding, reason, s, length, &exc,
8033 collstart-s, collend-s, &newpos);
8034 if (repunicode == NULL)
8035 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008036 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00008037 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008038 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8039 Py_DECREF(repunicode);
8040 goto onError;
8041 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008042 /* generate replacement */
8043 repsize = PyUnicode_GET_SIZE(repunicode);
8044 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
8045 Py_UNICODE ch = *uni2;
8046 if (Py_UNICODE_ISSPACE(ch))
8047 *output++ = ' ';
8048 else {
8049 decimal = Py_UNICODE_TODECIMAL(ch);
8050 if (decimal >= 0)
8051 *output++ = '0' + decimal;
8052 else if (0 < ch && ch < 256)
8053 *output++ = (char)ch;
8054 else {
8055 Py_DECREF(repunicode);
8056 raise_encode_exception(&exc, encoding,
8057 s, length, collstart-s, collend-s, reason);
8058 goto onError;
8059 }
8060 }
8061 }
8062 p = s + newpos;
8063 Py_DECREF(repunicode);
8064 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00008065 }
8066 /* 0-terminate the output string */
8067 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008068 Py_XDECREF(exc);
8069 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008070 return 0;
8071
Benjamin Peterson29060642009-01-31 22:14:21 +00008072 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008073 Py_XDECREF(exc);
8074 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008075 return -1;
8076}
8077
Guido van Rossumd57fd912000-03-10 22:53:23 +00008078/* --- Helpers ------------------------------------------------------------ */
8079
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008080#include "stringlib/ucs1lib.h"
8081#include "stringlib/fastsearch.h"
8082#include "stringlib/partition.h"
8083#include "stringlib/split.h"
8084#include "stringlib/count.h"
8085#include "stringlib/find.h"
8086#include "stringlib/localeutil.h"
8087#include "stringlib/undef.h"
8088
8089#include "stringlib/ucs2lib.h"
8090#include "stringlib/fastsearch.h"
8091#include "stringlib/partition.h"
8092#include "stringlib/split.h"
8093#include "stringlib/count.h"
8094#include "stringlib/find.h"
8095#include "stringlib/localeutil.h"
8096#include "stringlib/undef.h"
8097
8098#include "stringlib/ucs4lib.h"
8099#include "stringlib/fastsearch.h"
8100#include "stringlib/partition.h"
8101#include "stringlib/split.h"
8102#include "stringlib/count.h"
8103#include "stringlib/find.h"
8104#include "stringlib/localeutil.h"
8105#include "stringlib/undef.h"
8106
8107static Py_ssize_t
8108any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
8109 const Py_UCS1*, Py_ssize_t,
8110 Py_ssize_t, Py_ssize_t),
8111 Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
8112 const Py_UCS2*, Py_ssize_t,
8113 Py_ssize_t, Py_ssize_t),
8114 Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t,
8115 const Py_UCS4*, Py_ssize_t,
8116 Py_ssize_t, Py_ssize_t),
8117 PyObject* s1, PyObject* s2,
8118 Py_ssize_t start,
8119 Py_ssize_t end)
8120{
8121 int kind1, kind2, kind;
8122 void *buf1, *buf2;
8123 Py_ssize_t len1, len2, result;
8124
8125 kind1 = PyUnicode_KIND(s1);
8126 kind2 = PyUnicode_KIND(s2);
8127 kind = kind1 > kind2 ? kind1 : kind2;
8128 buf1 = PyUnicode_DATA(s1);
8129 buf2 = PyUnicode_DATA(s2);
8130 if (kind1 != kind)
8131 buf1 = _PyUnicode_AsKind(s1, kind);
8132 if (!buf1)
8133 return -2;
8134 if (kind2 != kind)
8135 buf2 = _PyUnicode_AsKind(s2, kind);
8136 if (!buf2) {
8137 if (kind1 != kind) PyMem_Free(buf1);
8138 return -2;
8139 }
8140 len1 = PyUnicode_GET_LENGTH(s1);
8141 len2 = PyUnicode_GET_LENGTH(s2);
8142
8143 switch(kind) {
8144 case PyUnicode_1BYTE_KIND:
8145 result = ucs1(buf1, len1, buf2, len2, start, end);
8146 break;
8147 case PyUnicode_2BYTE_KIND:
8148 result = ucs2(buf1, len1, buf2, len2, start, end);
8149 break;
8150 case PyUnicode_4BYTE_KIND:
8151 result = ucs4(buf1, len1, buf2, len2, start, end);
8152 break;
8153 default:
8154 assert(0); result = -2;
8155 }
8156
8157 if (kind1 != kind)
8158 PyMem_Free(buf1);
8159 if (kind2 != kind)
8160 PyMem_Free(buf2);
8161
8162 return result;
8163}
8164
8165Py_ssize_t
8166_PyUnicode_InsertThousandsGrouping(int kind, void *data,
8167 Py_ssize_t n_buffer,
8168 void *digits, Py_ssize_t n_digits,
8169 Py_ssize_t min_width,
8170 const char *grouping,
8171 const char *thousands_sep)
8172{
8173 switch(kind) {
8174 case PyUnicode_1BYTE_KIND:
8175 return _PyUnicode_ucs1_InsertThousandsGrouping(
8176 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8177 min_width, grouping, thousands_sep);
8178 case PyUnicode_2BYTE_KIND:
8179 return _PyUnicode_ucs2_InsertThousandsGrouping(
8180 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
8181 min_width, grouping, thousands_sep);
8182 case PyUnicode_4BYTE_KIND:
8183 return _PyUnicode_ucs4_InsertThousandsGrouping(
8184 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
8185 min_width, grouping, thousands_sep);
8186 }
8187 assert(0);
8188 return -1;
8189}
8190
8191
Eric Smith8c663262007-08-25 02:26:07 +00008192#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00008193#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008194
Thomas Wouters477c8d52006-05-27 19:21:47 +00008195#include "stringlib/count.h"
8196#include "stringlib/find.h"
Eric Smith5807c412008-05-11 21:00:57 +00008197
/* Helper macro to fix up start/end slice values in place.

   end is clamped to len when it is larger; a negative end has len added
   and is then clamped to 0.  A negative start has len added and is then
   clamped to 0.  start and end must be modifiable lvalues.

   The body is wrapped in do { } while (0) so that the macro expands to a
   single statement (safe inside an unbraced if/else), and every argument
   is parenthesized.  Note that the arguments are still evaluated more
   than once, so they must not have side effects. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00008212
Alexander Belopolsky40018472011-02-26 01:02:56 +00008213Py_ssize_t
8214PyUnicode_Count(PyObject *str,
8215 PyObject *substr,
8216 Py_ssize_t start,
8217 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008218{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008219 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008220 PyUnicodeObject* str_obj;
8221 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008222 int kind1, kind2, kind;
8223 void *buf1 = NULL, *buf2 = NULL;
8224 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008225
Thomas Wouters477c8d52006-05-27 19:21:47 +00008226 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008227 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008228 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008229 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02008230 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008231 Py_DECREF(str_obj);
8232 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008233 }
Tim Petersced69f82003-09-16 20:30:58 +00008234
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008235 kind1 = PyUnicode_KIND(str_obj);
8236 kind2 = PyUnicode_KIND(sub_obj);
8237 kind = kind1 > kind2 ? kind1 : kind2;
8238 buf1 = PyUnicode_DATA(str_obj);
8239 if (kind1 != kind)
8240 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
8241 if (!buf1)
8242 goto onError;
8243 buf2 = PyUnicode_DATA(sub_obj);
8244 if (kind2 != kind)
8245 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
8246 if (!buf2)
8247 goto onError;
8248 len1 = PyUnicode_GET_LENGTH(str_obj);
8249 len2 = PyUnicode_GET_LENGTH(sub_obj);
8250
8251 ADJUST_INDICES(start, end, len1);
8252 switch(kind) {
8253 case PyUnicode_1BYTE_KIND:
8254 result = ucs1lib_count(
8255 ((Py_UCS1*)buf1) + start, end - start,
8256 buf2, len2, PY_SSIZE_T_MAX
8257 );
8258 break;
8259 case PyUnicode_2BYTE_KIND:
8260 result = ucs2lib_count(
8261 ((Py_UCS2*)buf1) + start, end - start,
8262 buf2, len2, PY_SSIZE_T_MAX
8263 );
8264 break;
8265 case PyUnicode_4BYTE_KIND:
8266 result = ucs4lib_count(
8267 ((Py_UCS4*)buf1) + start, end - start,
8268 buf2, len2, PY_SSIZE_T_MAX
8269 );
8270 break;
8271 default:
8272 assert(0); result = 0;
8273 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008274
8275 Py_DECREF(sub_obj);
8276 Py_DECREF(str_obj);
8277
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008278 if (kind1 != kind)
8279 PyMem_Free(buf1);
8280 if (kind2 != kind)
8281 PyMem_Free(buf2);
8282
Guido van Rossumd57fd912000-03-10 22:53:23 +00008283 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008284 onError:
8285 Py_DECREF(sub_obj);
8286 Py_DECREF(str_obj);
8287 if (kind1 != kind && buf1)
8288 PyMem_Free(buf1);
8289 if (kind2 != kind && buf2)
8290 PyMem_Free(buf2);
8291 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008292}
8293
Alexander Belopolsky40018472011-02-26 01:02:56 +00008294Py_ssize_t
8295PyUnicode_Find(PyObject *str,
8296 PyObject *sub,
8297 Py_ssize_t start,
8298 Py_ssize_t end,
8299 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008300{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008301 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008302
Guido van Rossumd57fd912000-03-10 22:53:23 +00008303 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008304 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008305 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008306 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008307 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008308 Py_DECREF(str);
8309 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008310 }
Tim Petersced69f82003-09-16 20:30:58 +00008311
Thomas Wouters477c8d52006-05-27 19:21:47 +00008312 if (direction > 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008313 result = any_find_slice(
8314 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
8315 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008316 );
8317 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008318 result = any_find_slice(
8319 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
8320 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008321 );
8322
Guido van Rossumd57fd912000-03-10 22:53:23 +00008323 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008324 Py_DECREF(sub);
8325
Guido van Rossumd57fd912000-03-10 22:53:23 +00008326 return result;
8327}
8328
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008329Py_ssize_t
8330PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8331 Py_ssize_t start, Py_ssize_t end,
8332 int direction)
8333{
8334 char *result;
8335 int kind;
8336 if (PyUnicode_READY(str) == -1)
8337 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02008338 if (start < 0 || end < 0) {
8339 PyErr_SetString(PyExc_IndexError, "string index out of range");
8340 return -2;
8341 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008342 if (end > PyUnicode_GET_LENGTH(str))
8343 end = PyUnicode_GET_LENGTH(str);
8344 kind = PyUnicode_KIND(str);
8345 result = findchar(PyUnicode_1BYTE_DATA(str)
8346 + PyUnicode_KIND_SIZE(kind, start),
8347 kind,
8348 end-start, ch, direction);
8349 if (!result)
8350 return -1;
8351 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8352}
8353
Alexander Belopolsky40018472011-02-26 01:02:56 +00008354static int
8355tailmatch(PyUnicodeObject *self,
8356 PyUnicodeObject *substring,
8357 Py_ssize_t start,
8358 Py_ssize_t end,
8359 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008360{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008361 int kind_self;
8362 int kind_sub;
8363 void *data_self;
8364 void *data_sub;
8365 Py_ssize_t offset;
8366 Py_ssize_t i;
8367 Py_ssize_t end_sub;
8368
8369 if (PyUnicode_READY(self) == -1 ||
8370 PyUnicode_READY(substring) == -1)
8371 return 0;
8372
8373 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008374 return 1;
8375
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008376 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8377 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008378 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008379 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008380
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008381 kind_self = PyUnicode_KIND(self);
8382 data_self = PyUnicode_DATA(self);
8383 kind_sub = PyUnicode_KIND(substring);
8384 data_sub = PyUnicode_DATA(substring);
8385 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8386
8387 if (direction > 0)
8388 offset = end;
8389 else
8390 offset = start;
8391
8392 if (PyUnicode_READ(kind_self, data_self, offset) ==
8393 PyUnicode_READ(kind_sub, data_sub, 0) &&
8394 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8395 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8396 /* If both are of the same kind, memcmp is sufficient */
8397 if (kind_self == kind_sub) {
8398 return ! memcmp((char *)data_self +
8399 (offset * PyUnicode_CHARACTER_SIZE(substring)),
8400 data_sub,
8401 PyUnicode_GET_LENGTH(substring) *
8402 PyUnicode_CHARACTER_SIZE(substring));
8403 }
8404 /* otherwise we have to compare each character by first accesing it */
8405 else {
8406 /* We do not need to compare 0 and len(substring)-1 because
8407 the if statement above ensured already that they are equal
8408 when we end up here. */
8409 // TODO: honor direction and do a forward or backwards search
8410 for (i = 1; i < end_sub; ++i) {
8411 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8412 PyUnicode_READ(kind_sub, data_sub, i))
8413 return 0;
8414 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008415 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008416 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008417 }
8418
8419 return 0;
8420}
8421
Alexander Belopolsky40018472011-02-26 01:02:56 +00008422Py_ssize_t
8423PyUnicode_Tailmatch(PyObject *str,
8424 PyObject *substr,
8425 Py_ssize_t start,
8426 Py_ssize_t end,
8427 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008428{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008429 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008430
Guido van Rossumd57fd912000-03-10 22:53:23 +00008431 str = PyUnicode_FromObject(str);
8432 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008433 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008434 substr = PyUnicode_FromObject(substr);
8435 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008436 Py_DECREF(str);
8437 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008438 }
Tim Petersced69f82003-09-16 20:30:58 +00008439
Guido van Rossumd57fd912000-03-10 22:53:23 +00008440 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00008441 (PyUnicodeObject *)substr,
8442 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008443 Py_DECREF(str);
8444 Py_DECREF(substr);
8445 return result;
8446}
8447
Guido van Rossumd57fd912000-03-10 22:53:23 +00008448/* Apply fixfct filter to the Unicode object self and return a
8449 reference to the modified object */
8450
Alexander Belopolsky40018472011-02-26 01:02:56 +00008451static PyObject *
8452fixup(PyUnicodeObject *self,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008453 Py_UCS4 (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008454{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008455 PyObject *u;
8456 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008457
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008458 if (PyUnicode_READY(self) == -1)
8459 return NULL;
8460 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8461 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8462 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008463 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008464 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008465
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008466 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
8467 PyUnicode_GET_LENGTH(u) * PyUnicode_CHARACTER_SIZE(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008468
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008469 /* fix functions return the new maximum character in a string,
8470 if the kind of the resulting unicode object does not change,
8471 everything is fine. Otherwise we need to change the string kind
8472 and re-run the fix function. */
8473 maxchar_new = fixfct((PyUnicodeObject*)u);
8474 if (maxchar_new == 0)
8475 /* do nothing, keep maxchar_new at 0 which means no changes. */;
8476 else if (maxchar_new <= 127)
8477 maxchar_new = 127;
8478 else if (maxchar_new <= 255)
8479 maxchar_new = 255;
8480 else if (maxchar_new <= 65535)
8481 maxchar_new = 65535;
8482 else
8483 maxchar_new = 1114111; /* 0x10ffff */
8484
8485 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008486 /* fixfct should return TRUE if it modified the buffer. If
8487 FALSE, return a reference to the original buffer instead
8488 (to save space, not time) */
8489 Py_INCREF(self);
8490 Py_DECREF(u);
8491 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008492 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008493 else if (maxchar_new == maxchar_old) {
8494 return u;
8495 }
8496 else {
8497 /* In case the maximum character changed, we need to
8498 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008499 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008500 if (v == NULL) {
8501 Py_DECREF(u);
8502 return NULL;
8503 }
8504 if (maxchar_new > maxchar_old) {
8505 /* If the maxchar increased so that the kind changed, not all
8506 characters are representable anymore and we need to fix the
8507 string again. This only happens in very few cases. */
Victor Stinner157f83f2011-09-28 21:41:31 +02008508 if (PyUnicode_CopyCharacters(v, 0,
8509 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008510 PyUnicode_GET_LENGTH(self)) < 0)
8511 {
8512 Py_DECREF(u);
8513 return NULL;
8514 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008515 maxchar_old = fixfct((PyUnicodeObject*)v);
8516 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8517 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008518 else {
Victor Stinner157f83f2011-09-28 21:41:31 +02008519 if (PyUnicode_CopyCharacters(v, 0,
8520 u, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008521 PyUnicode_GET_LENGTH(self)) < 0)
8522 {
8523 Py_DECREF(u);
8524 return NULL;
8525 }
8526 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008527
8528 Py_DECREF(u);
8529 return v;
8530 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008531}
8532
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008533static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008534fixupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008535{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008536 /* No need to call PyUnicode_READY(self) because this function is only
8537 called as a callback from fixup() which does it already. */
8538 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8539 const int kind = PyUnicode_KIND(self);
8540 void *data = PyUnicode_DATA(self);
8541 int touched = 0;
8542 Py_UCS4 maxchar = 0;
8543 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008544
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008545 for (i = 0; i < len; ++i) {
8546 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8547 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8548 if (up != ch) {
8549 if (up > maxchar)
8550 maxchar = up;
8551 PyUnicode_WRITE(kind, data, i, up);
8552 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008553 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008554 else if (ch > maxchar)
8555 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008556 }
8557
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008558 if (touched)
8559 return maxchar;
8560 else
8561 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008562}
8563
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008564static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008565fixlower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008566{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008567 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8568 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8569 const int kind = PyUnicode_KIND(self);
8570 void *data = PyUnicode_DATA(self);
8571 int touched = 0;
8572 Py_UCS4 maxchar = 0;
8573 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008574
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008575 for(i = 0; i < len; ++i) {
8576 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8577 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8578 if (lo != ch) {
8579 if (lo > maxchar)
8580 maxchar = lo;
8581 PyUnicode_WRITE(kind, data, i, lo);
8582 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008583 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008584 else if (ch > maxchar)
8585 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008586 }
8587
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008588 if (touched)
8589 return maxchar;
8590 else
8591 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008592}
8593
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008594static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008595fixswapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008596{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008597 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8598 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8599 const int kind = PyUnicode_KIND(self);
8600 void *data = PyUnicode_DATA(self);
8601 int touched = 0;
8602 Py_UCS4 maxchar = 0;
8603 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008604
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008605 for(i = 0; i < len; ++i) {
8606 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8607 Py_UCS4 nu = 0;
8608
8609 if (Py_UNICODE_ISUPPER(ch))
8610 nu = Py_UNICODE_TOLOWER(ch);
8611 else if (Py_UNICODE_ISLOWER(ch))
8612 nu = Py_UNICODE_TOUPPER(ch);
8613
8614 if (nu != 0) {
8615 if (nu > maxchar)
8616 maxchar = nu;
8617 PyUnicode_WRITE(kind, data, i, nu);
8618 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008619 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008620 else if (ch > maxchar)
8621 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008622 }
8623
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008624 if (touched)
8625 return maxchar;
8626 else
8627 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008628}
8629
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008630static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008631fixcapitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008632{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008633 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8634 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8635 const int kind = PyUnicode_KIND(self);
8636 void *data = PyUnicode_DATA(self);
8637 int touched = 0;
8638 Py_UCS4 maxchar = 0;
8639 Py_ssize_t i = 0;
8640 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00008641
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008642 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008643 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008644
8645 ch = PyUnicode_READ(kind, data, i);
8646 if (!Py_UNICODE_ISUPPER(ch)) {
8647 maxchar = Py_UNICODE_TOUPPER(ch);
8648 PyUnicode_WRITE(kind, data, i, maxchar);
8649 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008650 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008651 ++i;
8652 for(; i < len; ++i) {
8653 ch = PyUnicode_READ(kind, data, i);
8654 if (!Py_UNICODE_ISLOWER(ch)) {
8655 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8656 if (lo > maxchar)
8657 maxchar = lo;
8658 PyUnicode_WRITE(kind, data, i, lo);
8659 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008660 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008661 else if (ch > maxchar)
8662 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008663 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008664
8665 if (touched)
8666 return maxchar;
8667 else
8668 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008669}
8670
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008671static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008672fixtitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008673{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008674 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8675 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8676 const int kind = PyUnicode_KIND(self);
8677 void *data = PyUnicode_DATA(self);
8678 Py_UCS4 maxchar = 0;
8679 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008680 int previous_is_cased;
8681
8682 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008683 if (len == 1) {
8684 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8685 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
8686 if (ti != ch) {
8687 PyUnicode_WRITE(kind, data, i, ti);
8688 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00008689 }
8690 else
8691 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008692 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008693 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008694 for(; i < len; ++i) {
8695 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8696 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00008697
Benjamin Peterson29060642009-01-31 22:14:21 +00008698 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008699 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00008700 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008701 nu = Py_UNICODE_TOTITLE(ch);
8702
8703 if (nu > maxchar)
8704 maxchar = nu;
8705 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00008706
Benjamin Peterson29060642009-01-31 22:14:21 +00008707 if (Py_UNICODE_ISLOWER(ch) ||
8708 Py_UNICODE_ISUPPER(ch) ||
8709 Py_UNICODE_ISTITLE(ch))
8710 previous_is_cased = 1;
8711 else
8712 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008713 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008714 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008715}
8716
Tim Peters8ce9f162004-08-27 01:49:32 +00008717PyObject *
8718PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008719{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008720 PyObject *sep = NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008721 Py_ssize_t seplen = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008722 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00008723 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008724 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
8725 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00008726 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008727 Py_ssize_t sz, i, res_offset;
8728 Py_UCS4 maxchar = 0;
8729 Py_UCS4 item_maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008730
Tim Peters05eba1f2004-08-27 21:32:02 +00008731 fseq = PySequence_Fast(seq, "");
8732 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008733 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00008734 }
8735
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008736 /* NOTE: the following code can't call back into Python code,
8737 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00008738 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008739
Tim Peters05eba1f2004-08-27 21:32:02 +00008740 seqlen = PySequence_Fast_GET_SIZE(fseq);
8741 /* If empty sequence, return u"". */
8742 if (seqlen == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008743 res = PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008744 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00008745 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008746 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00008747 /* If singleton sequence with an exact Unicode, return that. */
8748 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008749 item = items[0];
8750 if (PyUnicode_CheckExact(item)) {
8751 Py_INCREF(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008752 res = item;
Benjamin Peterson29060642009-01-31 22:14:21 +00008753 goto Done;
8754 }
Tim Peters8ce9f162004-08-27 01:49:32 +00008755 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008756 else {
8757 /* Set up sep and seplen */
8758 if (separator == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008759 /* fall back to a blank space separator */
8760 sep = PyUnicode_FromOrdinal(' ');
Victor Stinnere9a29352011-10-01 02:14:59 +02008761 if (!sep)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008762 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00008763 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008764 else {
8765 if (!PyUnicode_Check(separator)) {
8766 PyErr_Format(PyExc_TypeError,
8767 "separator: expected str instance,"
8768 " %.80s found",
8769 Py_TYPE(separator)->tp_name);
8770 goto onError;
8771 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008772 if (PyUnicode_READY(separator) == -1)
8773 goto onError;
8774 sep = separator;
8775 seplen = PyUnicode_GET_LENGTH(separator);
8776 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
8777 /* inc refcount to keep this code path symetric with the
8778 above case of a blank separator */
8779 Py_INCREF(sep);
Tim Peters05eba1f2004-08-27 21:32:02 +00008780 }
8781 }
8782
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008783 /* There are at least two things to join, or else we have a subclass
8784 * of str in the sequence.
8785 * Do a pre-pass to figure out the total amount of space we'll
8786 * need (sz), and see whether all argument are strings.
8787 */
8788 sz = 0;
8789 for (i = 0; i < seqlen; i++) {
8790 const Py_ssize_t old_sz = sz;
8791 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00008792 if (!PyUnicode_Check(item)) {
8793 PyErr_Format(PyExc_TypeError,
8794 "sequence item %zd: expected str instance,"
8795 " %.80s found",
8796 i, Py_TYPE(item)->tp_name);
8797 goto onError;
8798 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008799 if (PyUnicode_READY(item) == -1)
8800 goto onError;
8801 sz += PyUnicode_GET_LENGTH(item);
8802 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
8803 if (item_maxchar > maxchar)
8804 maxchar = item_maxchar;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008805 if (i != 0)
8806 sz += seplen;
8807 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
8808 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008809 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008810 goto onError;
8811 }
8812 }
Tim Petersced69f82003-09-16 20:30:58 +00008813
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008814 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008815 if (res == NULL)
8816 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00008817
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008818 /* Catenate everything. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008819 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008820 Py_ssize_t itemlen;
8821 item = items[i];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008822 itemlen = PyUnicode_GET_LENGTH(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008823 /* Copy item, and maybe the separator. */
8824 if (i) {
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008825 if (PyUnicode_CopyCharacters(res, res_offset,
8826 sep, 0, seplen) < 0)
8827 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008828 res_offset += seplen;
Benjamin Peterson29060642009-01-31 22:14:21 +00008829 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008830 if (PyUnicode_CopyCharacters(res, res_offset,
8831 item, 0, itemlen) < 0)
8832 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008833 res_offset += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00008834 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008835 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00008836
Benjamin Peterson29060642009-01-31 22:14:21 +00008837 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00008838 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008839 Py_XDECREF(sep);
8840 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008841
Benjamin Peterson29060642009-01-31 22:14:21 +00008842 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00008843 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008844 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00008845 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008846 return NULL;
8847}
8848
/* Store `value` into `length` consecutive characters of the buffer
   `data`, beginning at character index `start`.  `kind` selects the
   character width: 1-byte buffers are filled with memset(), 2-byte
   buffers with a Py_UCS2 loop, and any other kind is treated as Py_UCS4.
   All arguments are parenthesized in the expansion; they may be
   evaluated more than once, so they must not have side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        } \
    } while (0)
8871
Alexander Belopolsky40018472011-02-26 01:02:56 +00008872static PyUnicodeObject *
8873pad(PyUnicodeObject *self,
8874 Py_ssize_t left,
8875 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008876 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008877{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008878 PyObject *u;
8879 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008880 int kind;
8881 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008882
8883 if (left < 0)
8884 left = 0;
8885 if (right < 0)
8886 right = 0;
8887
Tim Peters7a29bd52001-09-12 03:03:31 +00008888 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008889 Py_INCREF(self);
8890 return self;
8891 }
8892
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008893 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
8894 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00008895 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
8896 return NULL;
8897 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008898 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
8899 if (fill > maxchar)
8900 maxchar = fill;
8901 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008902 if (!u)
8903 return NULL;
8904
8905 kind = PyUnicode_KIND(u);
8906 data = PyUnicode_DATA(u);
8907 if (left)
8908 FILL(kind, data, fill, 0, left);
8909 if (right)
8910 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinner157f83f2011-09-28 21:41:31 +02008911 if (PyUnicode_CopyCharacters(u, left,
8912 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008913 _PyUnicode_LENGTH(self)) < 0)
8914 {
8915 Py_DECREF(u);
8916 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008917 }
8918
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008919 return (PyUnicodeObject*)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008920}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008921#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00008922
Alexander Belopolsky40018472011-02-26 01:02:56 +00008923PyObject *
8924PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008925{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008926 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008927
8928 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008929 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008930 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008931
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008932 switch(PyUnicode_KIND(string)) {
8933 case PyUnicode_1BYTE_KIND:
8934 list = ucs1lib_splitlines(
8935 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
8936 PyUnicode_GET_LENGTH(string), keepends);
8937 break;
8938 case PyUnicode_2BYTE_KIND:
8939 list = ucs2lib_splitlines(
8940 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
8941 PyUnicode_GET_LENGTH(string), keepends);
8942 break;
8943 case PyUnicode_4BYTE_KIND:
8944 list = ucs4lib_splitlines(
8945 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
8946 PyUnicode_GET_LENGTH(string), keepends);
8947 break;
8948 default:
8949 assert(0);
8950 list = 0;
8951 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008952 Py_DECREF(string);
8953 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008954}
8955
Alexander Belopolsky40018472011-02-26 01:02:56 +00008956static PyObject *
8957split(PyUnicodeObject *self,
8958 PyUnicodeObject *substring,
8959 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008960{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008961 int kind1, kind2, kind;
8962 void *buf1, *buf2;
8963 Py_ssize_t len1, len2;
8964 PyObject* out;
8965
Guido van Rossumd57fd912000-03-10 22:53:23 +00008966 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008967 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008968
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008969 if (PyUnicode_READY(self) == -1)
8970 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008971
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008972 if (substring == NULL)
8973 switch(PyUnicode_KIND(self)) {
8974 case PyUnicode_1BYTE_KIND:
8975 return ucs1lib_split_whitespace(
8976 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
8977 PyUnicode_GET_LENGTH(self), maxcount
8978 );
8979 case PyUnicode_2BYTE_KIND:
8980 return ucs2lib_split_whitespace(
8981 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
8982 PyUnicode_GET_LENGTH(self), maxcount
8983 );
8984 case PyUnicode_4BYTE_KIND:
8985 return ucs4lib_split_whitespace(
8986 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
8987 PyUnicode_GET_LENGTH(self), maxcount
8988 );
8989 default:
8990 assert(0);
8991 return NULL;
8992 }
8993
8994 if (PyUnicode_READY(substring) == -1)
8995 return NULL;
8996
8997 kind1 = PyUnicode_KIND(self);
8998 kind2 = PyUnicode_KIND(substring);
8999 kind = kind1 > kind2 ? kind1 : kind2;
9000 buf1 = PyUnicode_DATA(self);
9001 buf2 = PyUnicode_DATA(substring);
9002 if (kind1 != kind)
9003 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9004 if (!buf1)
9005 return NULL;
9006 if (kind2 != kind)
9007 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9008 if (!buf2) {
9009 if (kind1 != kind) PyMem_Free(buf1);
9010 return NULL;
9011 }
9012 len1 = PyUnicode_GET_LENGTH(self);
9013 len2 = PyUnicode_GET_LENGTH(substring);
9014
9015 switch(kind) {
9016 case PyUnicode_1BYTE_KIND:
9017 out = ucs1lib_split(
9018 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9019 break;
9020 case PyUnicode_2BYTE_KIND:
9021 out = ucs2lib_split(
9022 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9023 break;
9024 case PyUnicode_4BYTE_KIND:
9025 out = ucs4lib_split(
9026 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9027 break;
9028 default:
9029 out = NULL;
9030 }
9031 if (kind1 != kind)
9032 PyMem_Free(buf1);
9033 if (kind2 != kind)
9034 PyMem_Free(buf2);
9035 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009036}
9037
Alexander Belopolsky40018472011-02-26 01:02:56 +00009038static PyObject *
9039rsplit(PyUnicodeObject *self,
9040 PyUnicodeObject *substring,
9041 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009042{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009043 int kind1, kind2, kind;
9044 void *buf1, *buf2;
9045 Py_ssize_t len1, len2;
9046 PyObject* out;
9047
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009048 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009049 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009050
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009051 if (PyUnicode_READY(self) == -1)
9052 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009053
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009054 if (substring == NULL)
9055 switch(PyUnicode_KIND(self)) {
9056 case PyUnicode_1BYTE_KIND:
9057 return ucs1lib_rsplit_whitespace(
9058 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9059 PyUnicode_GET_LENGTH(self), maxcount
9060 );
9061 case PyUnicode_2BYTE_KIND:
9062 return ucs2lib_rsplit_whitespace(
9063 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9064 PyUnicode_GET_LENGTH(self), maxcount
9065 );
9066 case PyUnicode_4BYTE_KIND:
9067 return ucs4lib_rsplit_whitespace(
9068 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9069 PyUnicode_GET_LENGTH(self), maxcount
9070 );
9071 default:
9072 assert(0);
9073 return NULL;
9074 }
9075
9076 if (PyUnicode_READY(substring) == -1)
9077 return NULL;
9078
9079 kind1 = PyUnicode_KIND(self);
9080 kind2 = PyUnicode_KIND(substring);
9081 kind = kind1 > kind2 ? kind1 : kind2;
9082 buf1 = PyUnicode_DATA(self);
9083 buf2 = PyUnicode_DATA(substring);
9084 if (kind1 != kind)
9085 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9086 if (!buf1)
9087 return NULL;
9088 if (kind2 != kind)
9089 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9090 if (!buf2) {
9091 if (kind1 != kind) PyMem_Free(buf1);
9092 return NULL;
9093 }
9094 len1 = PyUnicode_GET_LENGTH(self);
9095 len2 = PyUnicode_GET_LENGTH(substring);
9096
9097 switch(kind) {
9098 case PyUnicode_1BYTE_KIND:
9099 out = ucs1lib_rsplit(
9100 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9101 break;
9102 case PyUnicode_2BYTE_KIND:
9103 out = ucs2lib_rsplit(
9104 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9105 break;
9106 case PyUnicode_4BYTE_KIND:
9107 out = ucs4lib_rsplit(
9108 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9109 break;
9110 default:
9111 out = NULL;
9112 }
9113 if (kind1 != kind)
9114 PyMem_Free(buf1);
9115 if (kind2 != kind)
9116 PyMem_Free(buf2);
9117 return out;
9118}
9119
9120static Py_ssize_t
9121anylib_find(int kind, void *buf1, Py_ssize_t len1,
9122 void *buf2, Py_ssize_t len2, Py_ssize_t offset)
9123{
9124 switch(kind) {
9125 case PyUnicode_1BYTE_KIND:
9126 return ucs1lib_find(buf1, len1, buf2, len2, offset);
9127 case PyUnicode_2BYTE_KIND:
9128 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9129 case PyUnicode_4BYTE_KIND:
9130 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9131 }
9132 assert(0);
9133 return -1;
9134}
9135
9136static Py_ssize_t
9137anylib_count(int kind, void* sbuf, Py_ssize_t slen,
9138 void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
9139{
9140 switch(kind) {
9141 case PyUnicode_1BYTE_KIND:
9142 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
9143 case PyUnicode_2BYTE_KIND:
9144 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9145 case PyUnicode_4BYTE_KIND:
9146 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9147 }
9148 assert(0);
9149 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009150}
9151
Alexander Belopolsky40018472011-02-26 01:02:56 +00009152static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009153replace(PyObject *self, PyObject *str1,
9154 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009155{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009156 PyObject *u;
9157 char *sbuf = PyUnicode_DATA(self);
9158 char *buf1 = PyUnicode_DATA(str1);
9159 char *buf2 = PyUnicode_DATA(str2);
9160 int srelease = 0, release1 = 0, release2 = 0;
9161 int skind = PyUnicode_KIND(self);
9162 int kind1 = PyUnicode_KIND(str1);
9163 int kind2 = PyUnicode_KIND(str2);
9164 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9165 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9166 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009167
9168 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009169 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009170 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009171 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009172
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009173 if (skind < kind1)
9174 /* substring too wide to be present */
9175 goto nothing;
9176
9177 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00009178 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009179 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009180 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009181 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009182 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009183 /* replace characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009184 Py_UCS4 u1, u2, maxchar;
9185 int mayshrink, rkind;
9186 u1 = PyUnicode_READ_CHAR(str1, 0);
9187 if (!findchar(sbuf, PyUnicode_KIND(self),
9188 slen, u1, 1))
Thomas Wouters477c8d52006-05-27 19:21:47 +00009189 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009190 u2 = PyUnicode_READ_CHAR(str2, 0);
9191 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9192 /* Replacing u1 with u2 may cause a maxchar reduction in the
9193 result string. */
9194 mayshrink = maxchar > 127;
9195 if (u2 > maxchar) {
9196 maxchar = u2;
9197 mayshrink = 0;
9198 }
9199 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009200 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009201 goto error;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009202 if (PyUnicode_CopyCharacters(u, 0,
9203 (PyObject*)self, 0, slen) < 0)
9204 {
9205 Py_DECREF(u);
9206 return NULL;
9207 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009208 rkind = PyUnicode_KIND(u);
9209 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
9210 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009211 if (--maxcount < 0)
9212 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009213 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009214 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009215 if (mayshrink) {
9216 PyObject *tmp = u;
9217 u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp),
9218 PyUnicode_GET_LENGTH(tmp));
9219 Py_DECREF(tmp);
9220 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009221 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009222 int rkind = skind;
9223 char *res;
9224 if (kind1 < rkind) {
9225 /* widen substring */
9226 buf1 = _PyUnicode_AsKind(str1, rkind);
9227 if (!buf1) goto error;
9228 release1 = 1;
9229 }
9230 i = anylib_find(rkind, sbuf, slen, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009231 if (i < 0)
9232 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009233 if (rkind > kind2) {
9234 /* widen replacement */
9235 buf2 = _PyUnicode_AsKind(str2, rkind);
9236 if (!buf2) goto error;
9237 release2 = 1;
9238 }
9239 else if (rkind < kind2) {
9240 /* widen self and buf1 */
9241 rkind = kind2;
9242 if (release1) PyMem_Free(buf1);
9243 sbuf = _PyUnicode_AsKind(self, rkind);
9244 if (!sbuf) goto error;
9245 srelease = 1;
9246 buf1 = _PyUnicode_AsKind(str1, rkind);
9247 if (!buf1) goto error;
9248 release1 = 1;
9249 }
9250 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen));
9251 if (!res) {
9252 PyErr_NoMemory();
9253 goto error;
9254 }
9255 memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen));
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009256 /* change everything in-place, starting with this one */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009257 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9258 buf2,
9259 PyUnicode_KIND_SIZE(rkind, len2));
9260 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009261
9262 while ( --maxcount > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009263 i = anylib_find(rkind, sbuf+PyUnicode_KIND_SIZE(rkind, i),
9264 slen-i,
9265 buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009266 if (i == -1)
9267 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009268 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9269 buf2,
9270 PyUnicode_KIND_SIZE(rkind, len2));
9271 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009272 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009273
9274 u = PyUnicode_FromKindAndData(rkind, res, slen);
9275 PyMem_Free(res);
9276 if (!u) goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009277 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009278 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009279
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009280 Py_ssize_t n, i, j, ires;
9281 Py_ssize_t product, new_size;
9282 int rkind = skind;
9283 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009284
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009285 if (kind1 < rkind) {
9286 buf1 = _PyUnicode_AsKind(str1, rkind);
9287 if (!buf1) goto error;
9288 release1 = 1;
9289 }
9290 n = anylib_count(rkind, sbuf, slen, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009291 if (n == 0)
9292 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009293 if (kind2 < rkind) {
9294 buf2 = _PyUnicode_AsKind(str2, rkind);
9295 if (!buf2) goto error;
9296 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009297 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009298 else if (kind2 > rkind) {
9299 rkind = kind2;
9300 sbuf = _PyUnicode_AsKind(self, rkind);
9301 if (!sbuf) goto error;
9302 srelease = 1;
9303 if (release1) PyMem_Free(buf1);
9304 buf1 = _PyUnicode_AsKind(str1, rkind);
9305 if (!buf1) goto error;
9306 release1 = 1;
9307 }
9308 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
9309 PyUnicode_GET_LENGTH(str1))); */
9310 product = n * (len2-len1);
9311 if ((product / (len2-len1)) != n) {
9312 PyErr_SetString(PyExc_OverflowError,
9313 "replace string is too long");
9314 goto error;
9315 }
9316 new_size = slen + product;
9317 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
9318 PyErr_SetString(PyExc_OverflowError,
9319 "replace string is too long");
9320 goto error;
9321 }
9322 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size));
9323 if (!res)
9324 goto error;
9325 ires = i = 0;
9326 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009327 while (n-- > 0) {
9328 /* look for next match */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009329 j = anylib_find(rkind,
9330 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9331 slen-i, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009332 if (j == -1)
9333 break;
9334 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009335 /* copy unchanged part [i:j] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009336 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9337 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9338 PyUnicode_KIND_SIZE(rkind, j-i));
9339 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009340 }
9341 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009342 if (len2 > 0) {
9343 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9344 buf2,
9345 PyUnicode_KIND_SIZE(rkind, len2));
9346 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009347 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009348 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009349 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009350 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009351 /* copy tail [i:] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009352 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9353 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9354 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009355 } else {
9356 /* interleave */
9357 while (n > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009358 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9359 buf2,
9360 PyUnicode_KIND_SIZE(rkind, len2));
9361 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009362 if (--n <= 0)
9363 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009364 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9365 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9366 PyUnicode_KIND_SIZE(rkind, 1));
9367 ires++;
9368 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009369 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009370 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9371 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9372 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009373 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009374 u = PyUnicode_FromKindAndData(rkind, res, new_size);
Martin v. Löwis0b1d3482011-10-01 16:35:40 +02009375 PyMem_Free(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009376 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009377 if (srelease)
9378 PyMem_FREE(sbuf);
9379 if (release1)
9380 PyMem_FREE(buf1);
9381 if (release2)
9382 PyMem_FREE(buf2);
9383 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009384
Benjamin Peterson29060642009-01-31 22:14:21 +00009385 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00009386 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009387 if (srelease)
9388 PyMem_FREE(sbuf);
9389 if (release1)
9390 PyMem_FREE(buf1);
9391 if (release2)
9392 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009393 if (PyUnicode_CheckExact(self)) {
9394 Py_INCREF(self);
9395 return (PyObject *) self;
9396 }
Victor Stinner034f6cf2011-09-30 02:26:44 +02009397 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009398 error:
9399 if (srelease && sbuf)
9400 PyMem_FREE(sbuf);
9401 if (release1 && buf1)
9402 PyMem_FREE(buf1);
9403 if (release2 && buf2)
9404 PyMem_FREE(buf2);
9405 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009406}
9407
9408/* --- Unicode Object Methods --------------------------------------------- */
9409
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009410PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009411 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009412\n\
9413Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009414characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009415
9416static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009417unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009418{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009419 return fixup(self, fixtitle);
9420}
9421
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009422PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009423 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009424\n\
9425Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00009426have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009427
9428static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009429unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009430{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009431 return fixup(self, fixcapitalize);
9432}
9433
9434#if 0
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009435PyDoc_STRVAR(capwords__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009436 "S.capwords() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009437\n\
9438Apply .capitalize() to all words in S and return the result with\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009439normalized whitespace (all whitespace strings are replaced by ' ').");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009440
9441static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009442unicode_capwords(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009443{
9444 PyObject *list;
9445 PyObject *item;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009446 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009447
Guido van Rossumd57fd912000-03-10 22:53:23 +00009448 /* Split into words */
9449 list = split(self, NULL, -1);
9450 if (!list)
9451 return NULL;
9452
9453 /* Capitalize each word */
9454 for (i = 0; i < PyList_GET_SIZE(list); i++) {
9455 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
Benjamin Peterson29060642009-01-31 22:14:21 +00009456 fixcapitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009457 if (item == NULL)
9458 goto onError;
9459 Py_DECREF(PyList_GET_ITEM(list, i));
9460 PyList_SET_ITEM(list, i, item);
9461 }
9462
9463 /* Join the words to form a new string */
9464 item = PyUnicode_Join(NULL, list);
9465
Benjamin Peterson29060642009-01-31 22:14:21 +00009466 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009467 Py_DECREF(list);
9468 return (PyObject *)item;
9469}
9470#endif
9471
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009472/* Argument converter. Coerces to a single unicode character */
9473
9474static int
9475convert_uc(PyObject *obj, void *addr)
9476{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009477 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009478 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009479
Benjamin Peterson14339b62009-01-31 16:36:08 +00009480 uniobj = PyUnicode_FromObject(obj);
9481 if (uniobj == NULL) {
9482 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009483 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009484 return 0;
9485 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009486 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009487 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009488 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009489 Py_DECREF(uniobj);
9490 return 0;
9491 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009492 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009493 Py_DECREF(uniobj);
9494 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009495}
9496
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009497PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009498 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009499\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009500Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009501done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009502
9503static PyObject *
9504unicode_center(PyUnicodeObject *self, PyObject *args)
9505{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009506 Py_ssize_t marg, left;
9507 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009508 Py_UCS4 fillchar = ' ';
9509
Victor Stinnere9a29352011-10-01 02:14:59 +02009510 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009511 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009512
Victor Stinnere9a29352011-10-01 02:14:59 +02009513 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009514 return NULL;
9515
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009516 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009517 Py_INCREF(self);
9518 return (PyObject*) self;
9519 }
9520
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009521 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009522 left = marg / 2 + (marg & width & 1);
9523
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009524 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009525}
9526
Marc-André Lemburge5034372000-08-08 08:04:29 +00009527#if 0
9528
9529/* This code should go into some future Unicode collation support
9530 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00009531 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00009532
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009533/* speedy UTF-16 code point order comparison */
9534/* gleaned from: */
9535/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
9536
/* Per-block adjustment table used by the disabled UTF-16 comparison:
   indexed by (code unit >> 11).  Only entries 27..31 are non-zero. */
static short utf16Fixup[32] =
{
    /*  0.. 7 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /*  8..15 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 16..23 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 24..26 */ 0, 0, 0,
    /* 27     */ 0x2000,
    /* 28..31 */ -0x800, -0x800, -0x800, -0x800
};
9544
Guido van Rossumd57fd912000-03-10 22:53:23 +00009545static int
9546unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9547{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009548 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009549
Guido van Rossumd57fd912000-03-10 22:53:23 +00009550 Py_UNICODE *s1 = str1->str;
9551 Py_UNICODE *s2 = str2->str;
9552
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009553 len1 = str1->_base._base.length;
9554 len2 = str2->_base._base.length;
Tim Petersced69f82003-09-16 20:30:58 +00009555
Guido van Rossumd57fd912000-03-10 22:53:23 +00009556 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00009557 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009558
9559 c1 = *s1++;
9560 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00009561
Benjamin Peterson29060642009-01-31 22:14:21 +00009562 if (c1 > (1<<11) * 26)
9563 c1 += utf16Fixup[c1>>11];
9564 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009565 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009566 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00009567
9568 if (c1 != c2)
9569 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00009570
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009571 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009572 }
9573
9574 return (len1 < len2) ? -1 : (len1 != len2);
9575}
9576
Marc-André Lemburge5034372000-08-08 08:04:29 +00009577#else
9578
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009579/* This function assumes that str1 and str2 are readied by the caller. */
9580
Marc-André Lemburge5034372000-08-08 08:04:29 +00009581static int
9582unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9583{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009584 int kind1, kind2;
9585 void *data1, *data2;
9586 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009587
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009588 kind1 = PyUnicode_KIND(str1);
9589 kind2 = PyUnicode_KIND(str2);
9590 data1 = PyUnicode_DATA(str1);
9591 data2 = PyUnicode_DATA(str2);
9592 len1 = PyUnicode_GET_LENGTH(str1);
9593 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +00009594
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009595 for (i = 0; i < len1 && i < len2; ++i) {
9596 Py_UCS4 c1, c2;
9597 c1 = PyUnicode_READ(kind1, data1, i);
9598 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +00009599
9600 if (c1 != c2)
9601 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009602 }
9603
9604 return (len1 < len2) ? -1 : (len1 != len2);
9605}
9606
9607#endif
9608
Alexander Belopolsky40018472011-02-26 01:02:56 +00009609int
9610PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009611{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009612 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9613 if (PyUnicode_READY(left) == -1 ||
9614 PyUnicode_READY(right) == -1)
9615 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009616 return unicode_compare((PyUnicodeObject *)left,
9617 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009618 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009619 PyErr_Format(PyExc_TypeError,
9620 "Can't compare %.100s and %.100s",
9621 left->ob_type->tp_name,
9622 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009623 return -1;
9624}
9625
Martin v. Löwis5b222132007-06-10 09:51:05 +00009626int
9627PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
9628{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009629 Py_ssize_t i;
9630 int kind;
9631 void *data;
9632 Py_UCS4 chr;
9633
Victor Stinner910337b2011-10-03 03:20:16 +02009634 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009635 if (PyUnicode_READY(uni) == -1)
9636 return -1;
9637 kind = PyUnicode_KIND(uni);
9638 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009639 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009640 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
9641 if (chr != str[i])
9642 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00009643 /* This check keeps Python strings that end in '\0' from comparing equal
9644 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009645 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +00009646 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009647 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00009648 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009649 return 0;
9650}
9651
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009652
/* Select Py_True or Py_False from a C condition; the macro itself does
   not touch the reference count of the selected object. */
#define TEST_COND(cond) ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009655
Alexander Belopolsky40018472011-02-26 01:02:56 +00009656PyObject *
9657PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009658{
9659 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009660
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009661 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9662 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009663 if (PyUnicode_READY(left) == -1 ||
9664 PyUnicode_READY(right) == -1)
9665 return NULL;
9666 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
9667 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009668 if (op == Py_EQ) {
9669 Py_INCREF(Py_False);
9670 return Py_False;
9671 }
9672 if (op == Py_NE) {
9673 Py_INCREF(Py_True);
9674 return Py_True;
9675 }
9676 }
9677 if (left == right)
9678 result = 0;
9679 else
9680 result = unicode_compare((PyUnicodeObject *)left,
9681 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009682
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009683 /* Convert the return value to a Boolean */
9684 switch (op) {
9685 case Py_EQ:
9686 v = TEST_COND(result == 0);
9687 break;
9688 case Py_NE:
9689 v = TEST_COND(result != 0);
9690 break;
9691 case Py_LE:
9692 v = TEST_COND(result <= 0);
9693 break;
9694 case Py_GE:
9695 v = TEST_COND(result >= 0);
9696 break;
9697 case Py_LT:
9698 v = TEST_COND(result == -1);
9699 break;
9700 case Py_GT:
9701 v = TEST_COND(result == 1);
9702 break;
9703 default:
9704 PyErr_BadArgument();
9705 return NULL;
9706 }
9707 Py_INCREF(v);
9708 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009709 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009710
Brian Curtindfc80e32011-08-10 20:28:54 -05009711 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009712}
9713
Alexander Belopolsky40018472011-02-26 01:02:56 +00009714int
9715PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00009716{
Thomas Wouters477c8d52006-05-27 19:21:47 +00009717 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009718 int kind1, kind2, kind;
9719 void *buf1, *buf2;
9720 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009721 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009722
9723 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00009724 sub = PyUnicode_FromObject(element);
9725 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009726 PyErr_Format(PyExc_TypeError,
9727 "'in <string>' requires string as left operand, not %s",
9728 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009729 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009730 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009731 if (PyUnicode_READY(sub) == -1)
9732 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009733
Thomas Wouters477c8d52006-05-27 19:21:47 +00009734 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +02009735 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009736 Py_DECREF(sub);
9737 return -1;
9738 }
9739
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009740 kind1 = PyUnicode_KIND(str);
9741 kind2 = PyUnicode_KIND(sub);
9742 kind = kind1 > kind2 ? kind1 : kind2;
9743 buf1 = PyUnicode_DATA(str);
9744 buf2 = PyUnicode_DATA(sub);
9745 if (kind1 != kind)
9746 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
9747 if (!buf1) {
9748 Py_DECREF(sub);
9749 return -1;
9750 }
9751 if (kind2 != kind)
9752 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
9753 if (!buf2) {
9754 Py_DECREF(sub);
9755 if (kind1 != kind) PyMem_Free(buf1);
9756 return -1;
9757 }
9758 len1 = PyUnicode_GET_LENGTH(str);
9759 len2 = PyUnicode_GET_LENGTH(sub);
9760
9761 switch(kind) {
9762 case PyUnicode_1BYTE_KIND:
9763 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
9764 break;
9765 case PyUnicode_2BYTE_KIND:
9766 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
9767 break;
9768 case PyUnicode_4BYTE_KIND:
9769 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
9770 break;
9771 default:
9772 result = -1;
9773 assert(0);
9774 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009775
9776 Py_DECREF(str);
9777 Py_DECREF(sub);
9778
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009779 if (kind1 != kind)
9780 PyMem_Free(buf1);
9781 if (kind2 != kind)
9782 PyMem_Free(buf2);
9783
Guido van Rossum403d68b2000-03-13 15:55:09 +00009784 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009785}
9786
Guido van Rossumd57fd912000-03-10 22:53:23 +00009787/* Concat to string or Unicode object giving a new Unicode object. */
9788
Alexander Belopolsky40018472011-02-26 01:02:56 +00009789PyObject *
9790PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009791{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009792 PyObject *u = NULL, *v = NULL, *w;
9793 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009794
9795 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009796 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009797 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009798 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009799 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009800 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009801 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009802
9803 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +02009804 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009805 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009806 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009807 }
Victor Stinnera464fc12011-10-02 20:39:30 +02009808 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009809 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009810 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009811 }
9812
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009813 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinnerff9e50f2011-09-28 22:17:19 +02009814 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009815
Guido van Rossumd57fd912000-03-10 22:53:23 +00009816 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009817 w = PyUnicode_New(
9818 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
9819 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009820 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009821 goto onError;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009822 if (PyUnicode_CopyCharacters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)) < 0)
9823 goto onError;
Victor Stinner157f83f2011-09-28 21:41:31 +02009824 if (PyUnicode_CopyCharacters(w, PyUnicode_GET_LENGTH(u),
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009825 v, 0,
9826 PyUnicode_GET_LENGTH(v)) < 0)
9827 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009828 Py_DECREF(u);
9829 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009830 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009831
Benjamin Peterson29060642009-01-31 22:14:21 +00009832 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009833 Py_XDECREF(u);
9834 Py_XDECREF(v);
9835 return NULL;
9836}
9837
Walter Dörwald1ab83302007-05-18 17:15:44 +00009838void
Victor Stinner23e56682011-10-03 03:54:37 +02009839PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +00009840{
Victor Stinner23e56682011-10-03 03:54:37 +02009841 PyObject *left, *res;
9842
9843 if (p_left == NULL) {
9844 if (!PyErr_Occurred())
9845 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +00009846 return;
9847 }
Victor Stinner23e56682011-10-03 03:54:37 +02009848 left = *p_left;
9849 if (right == NULL || !PyUnicode_Check(left)) {
9850 if (!PyErr_Occurred())
9851 PyErr_BadInternalCall();
9852 goto error;
9853 }
9854
9855 if (PyUnicode_CheckExact(left) && left != unicode_empty
9856 && PyUnicode_CheckExact(right) && right != unicode_empty
9857 && unicode_resizable(left)
9858 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
9859 || _PyUnicode_WSTR(left) != NULL))
9860 {
9861 Py_ssize_t u_len, v_len, new_len, copied;
9862
9863 /* FIXME: don't make wstr string ready */
9864 if (PyUnicode_READY(left))
9865 goto error;
9866 if (PyUnicode_READY(right))
9867 goto error;
9868
9869 /* FIXME: support ascii+latin1, PyASCIIObject => PyCompactUnicodeObject */
9870 if (PyUnicode_MAX_CHAR_VALUE(right) <= PyUnicode_MAX_CHAR_VALUE(left))
9871 {
9872 u_len = PyUnicode_GET_LENGTH(left);
9873 v_len = PyUnicode_GET_LENGTH(right);
9874 if (u_len > PY_SSIZE_T_MAX - v_len) {
9875 PyErr_SetString(PyExc_OverflowError,
9876 "strings are too large to concat");
9877 goto error;
9878 }
9879 new_len = u_len + v_len;
9880
9881 /* Now we own the last reference to 'left', so we can resize it
9882 * in-place.
9883 */
9884 if (unicode_resize(&left, new_len) != 0) {
9885 /* XXX if _PyUnicode_Resize() fails, 'left' has been
9886 * deallocated so it cannot be put back into
9887 * 'variable'. The MemoryError is raised when there
9888 * is no value in 'variable', which might (very
9889 * remotely) be a cause of incompatibilities.
9890 */
9891 goto error;
9892 }
9893 /* copy 'right' into the newly allocated area of 'left' */
9894 copied = PyUnicode_CopyCharacters(left, u_len,
9895 right, 0,
9896 v_len);
9897 assert(0 <= copied);
9898 *p_left = left;
9899 return;
9900 }
9901 }
9902
9903 res = PyUnicode_Concat(left, right);
9904 if (res == NULL)
9905 goto error;
9906 Py_DECREF(left);
9907 *p_left = res;
9908 return;
9909
9910error:
9911 Py_DECREF(*p_left);
9912 *p_left = NULL;
Walter Dörwald1ab83302007-05-18 17:15:44 +00009913}
9914
9915void
9916PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
9917{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009918 PyUnicode_Append(pleft, right);
9919 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00009920}
9921
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009922PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009923 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009924\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00009925Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009926string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009927interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009928
9929static PyObject *
9930unicode_count(PyUnicodeObject *self, PyObject *args)
9931{
9932 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009933 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009934 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009935 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009936 int kind1, kind2, kind;
9937 void *buf1, *buf2;
9938 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009939
Jesus Ceaac451502011-04-20 17:09:23 +02009940 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
9941 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +00009942 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00009943
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009944 kind1 = PyUnicode_KIND(self);
9945 kind2 = PyUnicode_KIND(substring);
9946 kind = kind1 > kind2 ? kind1 : kind2;
9947 buf1 = PyUnicode_DATA(self);
9948 buf2 = PyUnicode_DATA(substring);
9949 if (kind1 != kind)
9950 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9951 if (!buf1) {
9952 Py_DECREF(substring);
9953 return NULL;
9954 }
9955 if (kind2 != kind)
9956 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9957 if (!buf2) {
9958 Py_DECREF(substring);
9959 if (kind1 != kind) PyMem_Free(buf1);
9960 return NULL;
9961 }
9962 len1 = PyUnicode_GET_LENGTH(self);
9963 len2 = PyUnicode_GET_LENGTH(substring);
9964
9965 ADJUST_INDICES(start, end, len1);
9966 switch(kind) {
9967 case PyUnicode_1BYTE_KIND:
9968 iresult = ucs1lib_count(
9969 ((Py_UCS1*)buf1) + start, end - start,
9970 buf2, len2, PY_SSIZE_T_MAX
9971 );
9972 break;
9973 case PyUnicode_2BYTE_KIND:
9974 iresult = ucs2lib_count(
9975 ((Py_UCS2*)buf1) + start, end - start,
9976 buf2, len2, PY_SSIZE_T_MAX
9977 );
9978 break;
9979 case PyUnicode_4BYTE_KIND:
9980 iresult = ucs4lib_count(
9981 ((Py_UCS4*)buf1) + start, end - start,
9982 buf2, len2, PY_SSIZE_T_MAX
9983 );
9984 break;
9985 default:
9986 assert(0); iresult = 0;
9987 }
9988
9989 result = PyLong_FromSsize_t(iresult);
9990
9991 if (kind1 != kind)
9992 PyMem_Free(buf1);
9993 if (kind2 != kind)
9994 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009995
9996 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009997
Guido van Rossumd57fd912000-03-10 22:53:23 +00009998 return result;
9999}
10000
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010001PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010002 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010003\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010004Encode S using the codec registered for encoding. Default encoding\n\
10005is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010006handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010007a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10008'xmlcharrefreplace' as well as any other name registered with\n\
10009codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010010
10011static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +000010012unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010013{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010014 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010015 char *encoding = NULL;
10016 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010017
Benjamin Peterson308d6372009-09-18 21:42:35 +000010018 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10019 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010020 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +000010021 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010022}
10023
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010024PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010025 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010026\n\
10027Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010028If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010029
10030static PyObject*
10031unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
10032{
10033 Py_UNICODE *e;
10034 Py_UNICODE *p;
10035 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010036 Py_UNICODE *qe;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010037 Py_ssize_t i, j, incr, wstr_length;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010038 PyUnicodeObject *u;
10039 int tabsize = 8;
10040
10041 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010042 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010043
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010044 if (PyUnicode_AsUnicodeAndSize((PyObject *)self, &wstr_length) == NULL)
10045 return NULL;
10046
Thomas Wouters7e474022000-07-16 12:04:32 +000010047 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010048 i = 0; /* chars up to and including most recent \n or \r */
10049 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010050 e = _PyUnicode_WSTR(self) + wstr_length; /* end of input */
10051 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010052 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010053 if (tabsize > 0) {
10054 incr = tabsize - (j % tabsize); /* cannot overflow */
10055 if (j > PY_SSIZE_T_MAX - incr)
10056 goto overflow1;
10057 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010058 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010059 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010060 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010061 if (j > PY_SSIZE_T_MAX - 1)
10062 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010063 j++;
10064 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010065 if (i > PY_SSIZE_T_MAX - j)
10066 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010067 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010068 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010069 }
10070 }
10071
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010072 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +000010073 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010074
Guido van Rossumd57fd912000-03-10 22:53:23 +000010075 /* Second pass: create output string and fill it */
10076 u = _PyUnicode_New(i + j);
10077 if (!u)
10078 return NULL;
10079
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010080 j = 0; /* same as in first pass */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010081 q = _PyUnicode_WSTR(u); /* next output char */
10082 qe = _PyUnicode_WSTR(u) + PyUnicode_GET_SIZE(u); /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +000010083
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010084 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010085 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010086 if (tabsize > 0) {
10087 i = tabsize - (j % tabsize);
10088 j += i;
10089 while (i--) {
10090 if (q >= qe)
10091 goto overflow2;
10092 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010093 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010094 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010095 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010096 else {
10097 if (q >= qe)
10098 goto overflow2;
10099 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010100 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010101 if (*p == '\n' || *p == '\r')
10102 j = 0;
10103 }
10104
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010105 if (PyUnicode_READY(u) == -1) {
10106 Py_DECREF(u);
10107 return NULL;
10108 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010109 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010110
10111 overflow2:
10112 Py_DECREF(u);
10113 overflow1:
10114 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10115 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010116}
10117
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010118PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010119 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010120\n\
10121Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010122such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010123arguments start and end are interpreted as in slice notation.\n\
10124\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010125Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010126
10127static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010128unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010129{
Jesus Ceaac451502011-04-20 17:09:23 +020010130 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010131 Py_ssize_t start;
10132 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010133 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010134
Jesus Ceaac451502011-04-20 17:09:23 +020010135 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10136 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010137 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010138
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010139 if (PyUnicode_READY(self) == -1)
10140 return NULL;
10141 if (PyUnicode_READY(substring) == -1)
10142 return NULL;
10143
10144 result = any_find_slice(
10145 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
10146 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010147 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010148
10149 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010150
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010151 if (result == -2)
10152 return NULL;
10153
Christian Heimes217cfd12007-12-02 14:31:20 +000010154 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010155}
10156
10157static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010158unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010159{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010160 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
10161 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010162 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010163 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010164}
10165
Guido van Rossumc2504932007-09-18 19:42:40 +000010166/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010010167 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000010168static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +000010169unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010170{
Guido van Rossumc2504932007-09-18 19:42:40 +000010171 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010010172 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010173
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010174 if (_PyUnicode_HASH(self) != -1)
10175 return _PyUnicode_HASH(self);
10176 if (PyUnicode_READY(self) == -1)
10177 return -1;
10178 len = PyUnicode_GET_LENGTH(self);
10179
10180 /* The hash function as a macro, gets expanded three times below. */
10181#define HASH(P) \
10182 x = (Py_uhash_t)*P << 7; \
10183 while (--len >= 0) \
10184 x = (1000003*x) ^ (Py_uhash_t)*P++;
10185
10186 switch (PyUnicode_KIND(self)) {
10187 case PyUnicode_1BYTE_KIND: {
10188 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
10189 HASH(c);
10190 break;
10191 }
10192 case PyUnicode_2BYTE_KIND: {
10193 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
10194 HASH(s);
10195 break;
10196 }
10197 default: {
10198 Py_UCS4 *l;
10199 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
10200 "Impossible switch case in unicode_hash");
10201 l = PyUnicode_4BYTE_DATA(self);
10202 HASH(l);
10203 break;
10204 }
10205 }
10206 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
10207
Guido van Rossumc2504932007-09-18 19:42:40 +000010208 if (x == -1)
10209 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010210 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010211 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010212}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010213#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000010214
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010215PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010216 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010217\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010218Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010219
10220static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010221unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010222{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010223 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +020010224 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010225 Py_ssize_t start;
10226 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010227
Jesus Ceaac451502011-04-20 17:09:23 +020010228 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
10229 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010230 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010231
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010232 if (PyUnicode_READY(self) == -1)
10233 return NULL;
10234 if (PyUnicode_READY(substring) == -1)
10235 return NULL;
10236
10237 result = any_find_slice(
10238 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
10239 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010240 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010241
10242 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010243
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010244 if (result == -2)
10245 return NULL;
10246
Guido van Rossumd57fd912000-03-10 22:53:23 +000010247 if (result < 0) {
10248 PyErr_SetString(PyExc_ValueError, "substring not found");
10249 return NULL;
10250 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010251
Christian Heimes217cfd12007-12-02 14:31:20 +000010252 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010253}
10254
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010255PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010256 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010257\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010258Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010259at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010260
10261static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010262unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010263{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010264 Py_ssize_t i, length;
10265 int kind;
10266 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010267 int cased;
10268
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010269 if (PyUnicode_READY(self) == -1)
10270 return NULL;
10271 length = PyUnicode_GET_LENGTH(self);
10272 kind = PyUnicode_KIND(self);
10273 data = PyUnicode_DATA(self);
10274
Guido van Rossumd57fd912000-03-10 22:53:23 +000010275 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010276 if (length == 1)
10277 return PyBool_FromLong(
10278 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010279
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010280 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010281 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010282 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010283
Guido van Rossumd57fd912000-03-10 22:53:23 +000010284 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010285 for (i = 0; i < length; i++) {
10286 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010287
Benjamin Peterson29060642009-01-31 22:14:21 +000010288 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
10289 return PyBool_FromLong(0);
10290 else if (!cased && Py_UNICODE_ISLOWER(ch))
10291 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010292 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010293 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010294}
10295
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010296PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010297 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010298\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010299Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010300at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010301
10302static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010303unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010304{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010305 Py_ssize_t i, length;
10306 int kind;
10307 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010308 int cased;
10309
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010310 if (PyUnicode_READY(self) == -1)
10311 return NULL;
10312 length = PyUnicode_GET_LENGTH(self);
10313 kind = PyUnicode_KIND(self);
10314 data = PyUnicode_DATA(self);
10315
Guido van Rossumd57fd912000-03-10 22:53:23 +000010316 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010317 if (length == 1)
10318 return PyBool_FromLong(
10319 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010320
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010321 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010322 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010323 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010324
Guido van Rossumd57fd912000-03-10 22:53:23 +000010325 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010326 for (i = 0; i < length; i++) {
10327 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010328
Benjamin Peterson29060642009-01-31 22:14:21 +000010329 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
10330 return PyBool_FromLong(0);
10331 else if (!cased && Py_UNICODE_ISUPPER(ch))
10332 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010333 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010334 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010335}
10336
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010337PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010338 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010339\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010340Return True if S is a titlecased string and there is at least one\n\
10341character in S, i.e. upper- and titlecase characters may only\n\
10342follow uncased characters and lowercase characters only cased ones.\n\
10343Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010344
10345static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010346unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010347{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010348 Py_ssize_t i, length;
10349 int kind;
10350 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010351 int cased, previous_is_cased;
10352
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010353 if (PyUnicode_READY(self) == -1)
10354 return NULL;
10355 length = PyUnicode_GET_LENGTH(self);
10356 kind = PyUnicode_KIND(self);
10357 data = PyUnicode_DATA(self);
10358
Guido van Rossumd57fd912000-03-10 22:53:23 +000010359 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010360 if (length == 1) {
10361 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10362 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
10363 (Py_UNICODE_ISUPPER(ch) != 0));
10364 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010365
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010366 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010367 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010368 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010369
Guido van Rossumd57fd912000-03-10 22:53:23 +000010370 cased = 0;
10371 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010372 for (i = 0; i < length; i++) {
10373 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010374
Benjamin Peterson29060642009-01-31 22:14:21 +000010375 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
10376 if (previous_is_cased)
10377 return PyBool_FromLong(0);
10378 previous_is_cased = 1;
10379 cased = 1;
10380 }
10381 else if (Py_UNICODE_ISLOWER(ch)) {
10382 if (!previous_is_cased)
10383 return PyBool_FromLong(0);
10384 previous_is_cased = 1;
10385 cased = 1;
10386 }
10387 else
10388 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010389 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010390 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010391}
10392
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010393PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010394 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010395\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010396Return True if all characters in S are whitespace\n\
10397and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010398
10399static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010400unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010401{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010402 Py_ssize_t i, length;
10403 int kind;
10404 void *data;
10405
10406 if (PyUnicode_READY(self) == -1)
10407 return NULL;
10408 length = PyUnicode_GET_LENGTH(self);
10409 kind = PyUnicode_KIND(self);
10410 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010411
Guido van Rossumd57fd912000-03-10 22:53:23 +000010412 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010413 if (length == 1)
10414 return PyBool_FromLong(
10415 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010416
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010417 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010418 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010419 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010420
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010421 for (i = 0; i < length; i++) {
10422 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010423 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010424 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010425 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010426 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010427}
10428
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010429PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010430 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010431\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010432Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010433and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010434
10435static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010436unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010437{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010438 Py_ssize_t i, length;
10439 int kind;
10440 void *data;
10441
10442 if (PyUnicode_READY(self) == -1)
10443 return NULL;
10444 length = PyUnicode_GET_LENGTH(self);
10445 kind = PyUnicode_KIND(self);
10446 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010447
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010448 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010449 if (length == 1)
10450 return PyBool_FromLong(
10451 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010452
10453 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010454 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010455 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010456
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010457 for (i = 0; i < length; i++) {
10458 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010459 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010460 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010461 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010462}
10463
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010464PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010465 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010466\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010467Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010468and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010469
10470static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010471unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010472{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010473 int kind;
10474 void *data;
10475 Py_ssize_t len, i;
10476
10477 if (PyUnicode_READY(self) == -1)
10478 return NULL;
10479
10480 kind = PyUnicode_KIND(self);
10481 data = PyUnicode_DATA(self);
10482 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010483
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010484 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010485 if (len == 1) {
10486 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10487 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10488 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010489
10490 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010491 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010492 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010493
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010494 for (i = 0; i < len; i++) {
10495 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010496 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010497 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010498 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010499 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010500}
10501
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010502PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010503 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010504\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010505Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010506False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010507
10508static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010509unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010510{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010511 Py_ssize_t i, length;
10512 int kind;
10513 void *data;
10514
10515 if (PyUnicode_READY(self) == -1)
10516 return NULL;
10517 length = PyUnicode_GET_LENGTH(self);
10518 kind = PyUnicode_KIND(self);
10519 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010520
Guido van Rossumd57fd912000-03-10 22:53:23 +000010521 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010522 if (length == 1)
10523 return PyBool_FromLong(
10524 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010525
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010526 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010527 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010528 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010529
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010530 for (i = 0; i < length; i++) {
10531 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010532 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010533 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010534 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010535}
10536
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010537PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010538 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010539\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010540Return True if all characters in S are digits\n\
10541and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010542
10543static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010544unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010545{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010546 Py_ssize_t i, length;
10547 int kind;
10548 void *data;
10549
10550 if (PyUnicode_READY(self) == -1)
10551 return NULL;
10552 length = PyUnicode_GET_LENGTH(self);
10553 kind = PyUnicode_KIND(self);
10554 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010555
Guido van Rossumd57fd912000-03-10 22:53:23 +000010556 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010557 if (length == 1) {
10558 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10559 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
10560 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010561
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010562 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010563 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010564 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010565
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010566 for (i = 0; i < length; i++) {
10567 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010568 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010569 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010570 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010571}
10572
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010573PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010574 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010575\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010576Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010577False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010578
10579static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010580unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010581{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010582 Py_ssize_t i, length;
10583 int kind;
10584 void *data;
10585
10586 if (PyUnicode_READY(self) == -1)
10587 return NULL;
10588 length = PyUnicode_GET_LENGTH(self);
10589 kind = PyUnicode_KIND(self);
10590 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010591
Guido van Rossumd57fd912000-03-10 22:53:23 +000010592 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010593 if (length == 1)
10594 return PyBool_FromLong(
10595 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010596
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010597 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010598 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010599 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010600
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010601 for (i = 0; i < length; i++) {
10602 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010603 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010604 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010605 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010606}
10607
Martin v. Löwis47383402007-08-15 07:32:56 +000010608int
10609PyUnicode_IsIdentifier(PyObject *self)
10610{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010611 int kind;
10612 void *data;
10613 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010614 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000010615
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010616 if (PyUnicode_READY(self) == -1) {
10617 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000010618 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010619 }
10620
10621 /* Special case for empty strings */
10622 if (PyUnicode_GET_LENGTH(self) == 0)
10623 return 0;
10624 kind = PyUnicode_KIND(self);
10625 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000010626
10627 /* PEP 3131 says that the first character must be in
10628 XID_Start and subsequent characters in XID_Continue,
10629 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000010630 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000010631 letters, digits, underscore). However, given the current
10632 definition of XID_Start and XID_Continue, it is sufficient
10633 to check just for these, except that _ must be allowed
10634 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010635 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050010636 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000010637 return 0;
10638
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040010639 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010640 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010641 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000010642 return 1;
10643}
10644
10645PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010646 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000010647\n\
10648Return True if S is a valid identifier according\n\
10649to the language definition.");
10650
10651static PyObject*
10652unicode_isidentifier(PyObject *self)
10653{
10654 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
10655}
10656
Georg Brandl559e5d72008-06-11 18:37:52 +000010657PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010658 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000010659\n\
10660Return True if all characters in S are considered\n\
10661printable in repr() or S is empty, False otherwise.");
10662
10663static PyObject*
10664unicode_isprintable(PyObject *self)
10665{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010666 Py_ssize_t i, length;
10667 int kind;
10668 void *data;
10669
10670 if (PyUnicode_READY(self) == -1)
10671 return NULL;
10672 length = PyUnicode_GET_LENGTH(self);
10673 kind = PyUnicode_KIND(self);
10674 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000010675
10676 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010677 if (length == 1)
10678 return PyBool_FromLong(
10679 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000010680
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010681 for (i = 0; i < length; i++) {
10682 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010683 Py_RETURN_FALSE;
10684 }
10685 }
10686 Py_RETURN_TRUE;
10687}
10688
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010689PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000010690 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010691\n\
10692Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000010693iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010694
10695static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010696unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010697{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010698 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010699}
10700
Martin v. Löwis18e16552006-02-15 17:27:45 +000010701static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000010702unicode_length(PyUnicodeObject *self)
10703{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010704 if (PyUnicode_READY(self) == -1)
10705 return -1;
10706 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010707}
10708
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010709PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010710 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010711\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010712Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010713done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010714
10715static PyObject *
10716unicode_ljust(PyUnicodeObject *self, PyObject *args)
10717{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010718 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010719 Py_UCS4 fillchar = ' ';
10720
10721 if (PyUnicode_READY(self) == -1)
10722 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010723
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010724 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010725 return NULL;
10726
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010727 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010728 Py_INCREF(self);
10729 return (PyObject*) self;
10730 }
10731
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010732 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010733}
10734
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010735PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010736 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010737\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010738Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010739
10740static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010741unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010742{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010743 return fixup(self, fixlower);
10744}
10745
/* Strip modes; the values double as indices into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing formats, indexed by strip mode. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for a strip mode: the format string without its leading
   three-character "|O:" prefix. */
#define STRIPNAME(i) (&stripformat[i][3])
10754
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010755/* externally visible for str.strip(unicode) */
10756PyObject *
10757_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
10758{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010759 void *data;
10760 int kind;
10761 Py_ssize_t i, j, len;
10762 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010763
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010764 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
10765 return NULL;
10766
10767 kind = PyUnicode_KIND(self);
10768 data = PyUnicode_DATA(self);
10769 len = PyUnicode_GET_LENGTH(self);
10770 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
10771 PyUnicode_DATA(sepobj),
10772 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010773
Benjamin Peterson14339b62009-01-31 16:36:08 +000010774 i = 0;
10775 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010776 while (i < len &&
10777 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010778 i++;
10779 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010780 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010781
Benjamin Peterson14339b62009-01-31 16:36:08 +000010782 j = len;
10783 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010784 do {
10785 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010786 } while (j >= i &&
10787 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000010788 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010789 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010790
Victor Stinner12bab6d2011-10-01 01:53:49 +020010791 return PyUnicode_Substring((PyObject*)self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010792}
10793
10794PyObject*
10795PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
10796{
10797 unsigned char *data;
10798 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020010799 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010800
Victor Stinnerde636f32011-10-01 03:55:54 +020010801 if (PyUnicode_READY(self) == -1)
10802 return NULL;
10803
10804 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
10805
Victor Stinner12bab6d2011-10-01 01:53:49 +020010806 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010807 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020010808 if (PyUnicode_CheckExact(self)) {
10809 Py_INCREF(self);
10810 return self;
10811 }
10812 else
10813 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010814 }
10815
Victor Stinner12bab6d2011-10-01 01:53:49 +020010816 length = end - start;
10817 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010818 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010819
Victor Stinnerde636f32011-10-01 03:55:54 +020010820 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020010821 PyErr_SetString(PyExc_IndexError, "string index out of range");
10822 return NULL;
10823 }
10824
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010825 kind = PyUnicode_KIND(self);
10826 data = PyUnicode_1BYTE_DATA(self);
Victor Stinner034f6cf2011-09-30 02:26:44 +020010827 return PyUnicode_FromKindAndData(kind,
10828 data + PyUnicode_KIND_SIZE(kind, start),
Victor Stinner12bab6d2011-10-01 01:53:49 +020010829 length);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010830}
Guido van Rossumd57fd912000-03-10 22:53:23 +000010831
10832static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010833do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010834{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010835 int kind;
10836 void *data;
10837 Py_ssize_t len, i, j;
10838
10839 if (PyUnicode_READY(self) == -1)
10840 return NULL;
10841
10842 kind = PyUnicode_KIND(self);
10843 data = PyUnicode_DATA(self);
10844 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010845
Benjamin Peterson14339b62009-01-31 16:36:08 +000010846 i = 0;
10847 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010848 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010849 i++;
10850 }
10851 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010852
Benjamin Peterson14339b62009-01-31 16:36:08 +000010853 j = len;
10854 if (striptype != LEFTSTRIP) {
10855 do {
10856 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010857 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010858 j++;
10859 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010860
Victor Stinner12bab6d2011-10-01 01:53:49 +020010861 return PyUnicode_Substring((PyObject*)self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010862}
10863
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010864
10865static PyObject *
10866do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
10867{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010868 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010869
Benjamin Peterson14339b62009-01-31 16:36:08 +000010870 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
10871 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010872
Benjamin Peterson14339b62009-01-31 16:36:08 +000010873 if (sep != NULL && sep != Py_None) {
10874 if (PyUnicode_Check(sep))
10875 return _PyUnicode_XStrip(self, striptype, sep);
10876 else {
10877 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010878 "%s arg must be None or str",
10879 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010880 return NULL;
10881 }
10882 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010883
Benjamin Peterson14339b62009-01-31 16:36:08 +000010884 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010885}
10886
10887
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010888PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010889 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010890\n\
10891Return a copy of the string S with leading and trailing\n\
10892whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010893If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010894
10895static PyObject *
10896unicode_strip(PyUnicodeObject *self, PyObject *args)
10897{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010898 if (PyTuple_GET_SIZE(args) == 0)
10899 return do_strip(self, BOTHSTRIP); /* Common case */
10900 else
10901 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010902}
10903
10904
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010905PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010906 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010907\n\
10908Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010909If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010910
10911static PyObject *
10912unicode_lstrip(PyUnicodeObject *self, PyObject *args)
10913{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010914 if (PyTuple_GET_SIZE(args) == 0)
10915 return do_strip(self, LEFTSTRIP); /* Common case */
10916 else
10917 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010918}
10919
10920
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010921PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010922 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010923\n\
10924Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010925If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010926
10927static PyObject *
10928unicode_rstrip(PyUnicodeObject *self, PyObject *args)
10929{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010930 if (PyTuple_GET_SIZE(args) == 0)
10931 return do_strip(self, RIGHTSTRIP); /* Common case */
10932 else
10933 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010934}
10935
10936
Guido van Rossumd57fd912000-03-10 22:53:23 +000010937static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000010938unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010939{
10940 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010941 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010942
Georg Brandl222de0f2009-04-12 12:01:50 +000010943 if (len < 1) {
10944 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020010945 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000010946 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010947
Tim Peters7a29bd52001-09-12 03:03:31 +000010948 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010949 /* no repeat, return original string */
10950 Py_INCREF(str);
10951 return (PyObject*) str;
10952 }
Tim Peters8f422462000-09-09 06:13:41 +000010953
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010954 if (PyUnicode_READY(str) == -1)
10955 return NULL;
10956
Victor Stinnerc759f3e2011-10-01 03:09:58 +020010957 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020010958 PyErr_SetString(PyExc_OverflowError,
10959 "repeated string is too long");
10960 return NULL;
10961 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010962 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020010963
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010964 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010965 if (!u)
10966 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020010967 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010968
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010969 if (PyUnicode_GET_LENGTH(str) == 1) {
10970 const int kind = PyUnicode_KIND(str);
10971 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
10972 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020010973 if (kind == PyUnicode_1BYTE_KIND)
10974 memset(to, (unsigned char)fill_char, len);
10975 else {
10976 for (n = 0; n < len; ++n)
10977 PyUnicode_WRITE(kind, to, n, fill_char);
10978 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010979 }
10980 else {
10981 /* number of characters copied this far */
10982 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
10983 const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str);
10984 char *to = (char *) PyUnicode_DATA(u);
10985 Py_MEMCPY(to, PyUnicode_DATA(str),
10986 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000010987 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010988 n = (done <= nchars-done) ? done : nchars-done;
10989 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010990 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000010991 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010992 }
10993
10994 return (PyObject*) u;
10995}
10996
Alexander Belopolsky40018472011-02-26 01:02:56 +000010997PyObject *
10998PyUnicode_Replace(PyObject *obj,
10999 PyObject *subobj,
11000 PyObject *replobj,
11001 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011002{
11003 PyObject *self;
11004 PyObject *str1;
11005 PyObject *str2;
11006 PyObject *result;
11007
11008 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011009 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011010 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011011 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011012 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011013 Py_DECREF(self);
11014 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011015 }
11016 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011017 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011018 Py_DECREF(self);
11019 Py_DECREF(str1);
11020 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011021 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011022 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011023 Py_DECREF(self);
11024 Py_DECREF(str1);
11025 Py_DECREF(str2);
11026 return result;
11027}
11028
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011029PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011030 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011031\n\
11032Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011033old replaced by new. If the optional argument count is\n\
11034given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011035
11036static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011037unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011038{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011039 PyObject *str1;
11040 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011041 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011042 PyObject *result;
11043
Martin v. Löwis18e16552006-02-15 17:27:45 +000011044 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011045 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011046 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011047 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011048 str1 = PyUnicode_FromObject(str1);
11049 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11050 return NULL;
11051 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011052 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011053 Py_DECREF(str1);
11054 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011055 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011056
11057 result = replace(self, str1, str2, maxcount);
11058
11059 Py_DECREF(str1);
11060 Py_DECREF(str2);
11061 return result;
11062}
11063
Alexander Belopolsky40018472011-02-26 01:02:56 +000011064static PyObject *
11065unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011066{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011067 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011068 Py_ssize_t isize;
11069 Py_ssize_t osize, squote, dquote, i, o;
11070 Py_UCS4 max, quote;
11071 int ikind, okind;
11072 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011073
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011074 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011075 return NULL;
11076
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011077 isize = PyUnicode_GET_LENGTH(unicode);
11078 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011079
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011080 /* Compute length of output, quote characters, and
11081 maximum character */
11082 osize = 2; /* quotes */
11083 max = 127;
11084 squote = dquote = 0;
11085 ikind = PyUnicode_KIND(unicode);
11086 for (i = 0; i < isize; i++) {
11087 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11088 switch (ch) {
11089 case '\'': squote++; osize++; break;
11090 case '"': dquote++; osize++; break;
11091 case '\\': case '\t': case '\r': case '\n':
11092 osize += 2; break;
11093 default:
11094 /* Fast-path ASCII */
11095 if (ch < ' ' || ch == 0x7f)
11096 osize += 4; /* \xHH */
11097 else if (ch < 0x7f)
11098 osize++;
11099 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11100 osize++;
11101 max = ch > max ? ch : max;
11102 }
11103 else if (ch < 0x100)
11104 osize += 4; /* \xHH */
11105 else if (ch < 0x10000)
11106 osize += 6; /* \uHHHH */
11107 else
11108 osize += 10; /* \uHHHHHHHH */
11109 }
11110 }
11111
11112 quote = '\'';
11113 if (squote) {
11114 if (dquote)
11115 /* Both squote and dquote present. Use squote,
11116 and escape them */
11117 osize += squote;
11118 else
11119 quote = '"';
11120 }
11121
11122 repr = PyUnicode_New(osize, max);
11123 if (repr == NULL)
11124 return NULL;
11125 okind = PyUnicode_KIND(repr);
11126 odata = PyUnicode_DATA(repr);
11127
11128 PyUnicode_WRITE(okind, odata, 0, quote);
11129 PyUnicode_WRITE(okind, odata, osize-1, quote);
11130
11131 for (i = 0, o = 1; i < isize; i++) {
11132 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011133
11134 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011135 if ((ch == quote) || (ch == '\\')) {
11136 PyUnicode_WRITE(okind, odata, o++, '\\');
11137 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011138 continue;
11139 }
11140
Benjamin Peterson29060642009-01-31 22:14:21 +000011141 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011142 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011143 PyUnicode_WRITE(okind, odata, o++, '\\');
11144 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011145 }
11146 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011147 PyUnicode_WRITE(okind, odata, o++, '\\');
11148 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011149 }
11150 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011151 PyUnicode_WRITE(okind, odata, o++, '\\');
11152 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011153 }
11154
11155 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011156 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011157 PyUnicode_WRITE(okind, odata, o++, '\\');
11158 PyUnicode_WRITE(okind, odata, o++, 'x');
11159 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11160 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011161 }
11162
Georg Brandl559e5d72008-06-11 18:37:52 +000011163 /* Copy ASCII characters as-is */
11164 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011165 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011166 }
11167
Benjamin Peterson29060642009-01-31 22:14:21 +000011168 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000011169 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011170 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000011171 (categories Z* and C* except ASCII space)
11172 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011173 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011174 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011175 if (ch <= 0xff) {
11176 PyUnicode_WRITE(okind, odata, o++, '\\');
11177 PyUnicode_WRITE(okind, odata, o++, 'x');
11178 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11179 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011180 }
11181 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011182 else if (ch >= 0x10000) {
11183 PyUnicode_WRITE(okind, odata, o++, '\\');
11184 PyUnicode_WRITE(okind, odata, o++, 'U');
11185 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
11186 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
11187 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
11188 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
11189 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11190 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11191 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11192 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011193 }
11194 /* Map 16-bit characters to '\uxxxx' */
11195 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011196 PyUnicode_WRITE(okind, odata, o++, '\\');
11197 PyUnicode_WRITE(okind, odata, o++, 'u');
11198 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11199 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11200 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11201 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011202 }
11203 }
11204 /* Copy characters as-is */
11205 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011206 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011207 }
11208 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000011209 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011210 /* Closing quote already added at the beginning */
Walter Dörwald79e913e2007-05-12 11:08:06 +000011211 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011212}
11213
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011214PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011215 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011216\n\
11217Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011218such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011219arguments start and end are interpreted as in slice notation.\n\
11220\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011221Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011222
11223static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011224unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011225{
Jesus Ceaac451502011-04-20 17:09:23 +020011226 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011227 Py_ssize_t start;
11228 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011229 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011230
Jesus Ceaac451502011-04-20 17:09:23 +020011231 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
11232 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011233 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011234
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011235 if (PyUnicode_READY(self) == -1)
11236 return NULL;
11237 if (PyUnicode_READY(substring) == -1)
11238 return NULL;
11239
11240 result = any_find_slice(
11241 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11242 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011243 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011244
11245 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011246
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011247 if (result == -2)
11248 return NULL;
11249
Christian Heimes217cfd12007-12-02 14:31:20 +000011250 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011251}
11252
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011253PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011254 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011255\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011256Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011257
11258static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011259unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011260{
Jesus Ceaac451502011-04-20 17:09:23 +020011261 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011262 Py_ssize_t start;
11263 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011264 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011265
Jesus Ceaac451502011-04-20 17:09:23 +020011266 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
11267 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011268 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011269
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011270 if (PyUnicode_READY(self) == -1)
11271 return NULL;
11272 if (PyUnicode_READY(substring) == -1)
11273 return NULL;
11274
11275 result = any_find_slice(
11276 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11277 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011278 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011279
11280 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011281
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011282 if (result == -2)
11283 return NULL;
11284
Guido van Rossumd57fd912000-03-10 22:53:23 +000011285 if (result < 0) {
11286 PyErr_SetString(PyExc_ValueError, "substring not found");
11287 return NULL;
11288 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011289
Christian Heimes217cfd12007-12-02 14:31:20 +000011290 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011291}
11292
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011293PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011294 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011295\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011296Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011297done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011298
11299static PyObject *
11300unicode_rjust(PyUnicodeObject *self, PyObject *args)
11301{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011302 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011303 Py_UCS4 fillchar = ' ';
11304
Victor Stinnere9a29352011-10-01 02:14:59 +020011305 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011306 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011307
Victor Stinnere9a29352011-10-01 02:14:59 +020011308 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011309 return NULL;
11310
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011311 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011312 Py_INCREF(self);
11313 return (PyObject*) self;
11314 }
11315
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011316 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011317}
11318
Alexander Belopolsky40018472011-02-26 01:02:56 +000011319PyObject *
11320PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011321{
11322 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000011323
Guido van Rossumd57fd912000-03-10 22:53:23 +000011324 s = PyUnicode_FromObject(s);
11325 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011326 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011327 if (sep != NULL) {
11328 sep = PyUnicode_FromObject(sep);
11329 if (sep == NULL) {
11330 Py_DECREF(s);
11331 return NULL;
11332 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011333 }
11334
11335 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11336
11337 Py_DECREF(s);
11338 Py_XDECREF(sep);
11339 return result;
11340}
11341
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011342PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011343 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011344\n\
11345Return a list of the words in S, using sep as the\n\
11346delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000011347splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000011348whitespace string is a separator and empty strings are\n\
11349removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011350
11351static PyObject*
11352unicode_split(PyUnicodeObject *self, PyObject *args)
11353{
11354 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011355 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011356
Martin v. Löwis18e16552006-02-15 17:27:45 +000011357 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011358 return NULL;
11359
11360 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011361 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011362 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011363 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011364 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011365 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011366}
11367
Thomas Wouters477c8d52006-05-27 19:21:47 +000011368PyObject *
11369PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11370{
11371 PyObject* str_obj;
11372 PyObject* sep_obj;
11373 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011374 int kind1, kind2, kind;
11375 void *buf1 = NULL, *buf2 = NULL;
11376 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011377
11378 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020011379 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011380 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011381 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011382 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011383 Py_DECREF(str_obj);
11384 return NULL;
11385 }
11386
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011387 kind1 = PyUnicode_KIND(str_in);
11388 kind2 = PyUnicode_KIND(sep_obj);
11389 kind = kind1 > kind2 ? kind1 : kind2;
11390 buf1 = PyUnicode_DATA(str_in);
11391 if (kind1 != kind)
11392 buf1 = _PyUnicode_AsKind(str_in, kind);
11393 if (!buf1)
11394 goto onError;
11395 buf2 = PyUnicode_DATA(sep_obj);
11396 if (kind2 != kind)
11397 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11398 if (!buf2)
11399 goto onError;
11400 len1 = PyUnicode_GET_LENGTH(str_obj);
11401 len2 = PyUnicode_GET_LENGTH(sep_obj);
11402
11403 switch(PyUnicode_KIND(str_in)) {
11404 case PyUnicode_1BYTE_KIND:
11405 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11406 break;
11407 case PyUnicode_2BYTE_KIND:
11408 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11409 break;
11410 case PyUnicode_4BYTE_KIND:
11411 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11412 break;
11413 default:
11414 assert(0);
11415 out = 0;
11416 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011417
11418 Py_DECREF(sep_obj);
11419 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011420 if (kind1 != kind)
11421 PyMem_Free(buf1);
11422 if (kind2 != kind)
11423 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011424
11425 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011426 onError:
11427 Py_DECREF(sep_obj);
11428 Py_DECREF(str_obj);
11429 if (kind1 != kind && buf1)
11430 PyMem_Free(buf1);
11431 if (kind2 != kind && buf2)
11432 PyMem_Free(buf2);
11433 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011434}
11435
11436
11437PyObject *
11438PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11439{
11440 PyObject* str_obj;
11441 PyObject* sep_obj;
11442 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011443 int kind1, kind2, kind;
11444 void *buf1 = NULL, *buf2 = NULL;
11445 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011446
11447 str_obj = PyUnicode_FromObject(str_in);
11448 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000011449 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011450 sep_obj = PyUnicode_FromObject(sep_in);
11451 if (!sep_obj) {
11452 Py_DECREF(str_obj);
11453 return NULL;
11454 }
11455
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011456 kind1 = PyUnicode_KIND(str_in);
11457 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020011458 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011459 buf1 = PyUnicode_DATA(str_in);
11460 if (kind1 != kind)
11461 buf1 = _PyUnicode_AsKind(str_in, kind);
11462 if (!buf1)
11463 goto onError;
11464 buf2 = PyUnicode_DATA(sep_obj);
11465 if (kind2 != kind)
11466 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11467 if (!buf2)
11468 goto onError;
11469 len1 = PyUnicode_GET_LENGTH(str_obj);
11470 len2 = PyUnicode_GET_LENGTH(sep_obj);
11471
11472 switch(PyUnicode_KIND(str_in)) {
11473 case PyUnicode_1BYTE_KIND:
11474 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11475 break;
11476 case PyUnicode_2BYTE_KIND:
11477 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11478 break;
11479 case PyUnicode_4BYTE_KIND:
11480 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11481 break;
11482 default:
11483 assert(0);
11484 out = 0;
11485 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011486
11487 Py_DECREF(sep_obj);
11488 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011489 if (kind1 != kind)
11490 PyMem_Free(buf1);
11491 if (kind2 != kind)
11492 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011493
11494 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011495 onError:
11496 Py_DECREF(sep_obj);
11497 Py_DECREF(str_obj);
11498 if (kind1 != kind && buf1)
11499 PyMem_Free(buf1);
11500 if (kind2 != kind && buf2)
11501 PyMem_Free(buf2);
11502 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011503}
11504
11505PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011506 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011507\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011508Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011509the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011510found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011511
11512static PyObject*
11513unicode_partition(PyUnicodeObject *self, PyObject *separator)
11514{
11515 return PyUnicode_Partition((PyObject *)self, separator);
11516}
11517
11518PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000011519 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011520\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011521Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011522the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011523separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011524
11525static PyObject*
11526unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
11527{
11528 return PyUnicode_RPartition((PyObject *)self, separator);
11529}
11530
Alexander Belopolsky40018472011-02-26 01:02:56 +000011531PyObject *
11532PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011533{
11534 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011535
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011536 s = PyUnicode_FromObject(s);
11537 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011538 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011539 if (sep != NULL) {
11540 sep = PyUnicode_FromObject(sep);
11541 if (sep == NULL) {
11542 Py_DECREF(s);
11543 return NULL;
11544 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011545 }
11546
11547 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11548
11549 Py_DECREF(s);
11550 Py_XDECREF(sep);
11551 return result;
11552}
11553
11554PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011555 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011556\n\
11557Return a list of the words in S, using sep as the\n\
11558delimiter string, starting at the end of the string and\n\
11559working to the front. If maxsplit is given, at most maxsplit\n\
11560splits are done. If sep is not specified, any whitespace string\n\
11561is a separator.");
11562
11563static PyObject*
11564unicode_rsplit(PyUnicodeObject *self, PyObject *args)
11565{
11566 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011567 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011568
Martin v. Löwis18e16552006-02-15 17:27:45 +000011569 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011570 return NULL;
11571
11572 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011573 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011574 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011575 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011576 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011577 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011578}
11579
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011580PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011581 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011582\n\
11583Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000011584Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011585is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011586
11587static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011588unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011589{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011590 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000011591 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011592
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011593 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
11594 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011595 return NULL;
11596
Guido van Rossum86662912000-04-11 15:38:46 +000011597 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011598}
11599
11600static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000011601PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011602{
Walter Dörwald346737f2007-05-31 10:44:43 +000011603 if (PyUnicode_CheckExact(self)) {
11604 Py_INCREF(self);
11605 return self;
11606 } else
11607 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020011608 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011609}
11610
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011611PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011612 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011613\n\
11614Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011615and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011616
11617static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011618unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011619{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011620 return fixup(self, fixswapcase);
11621}
11622
Georg Brandlceee0772007-11-27 23:48:05 +000011623PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011624 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011625\n\
11626Return a translation table usable for str.translate().\n\
11627If there is only one argument, it must be a dictionary mapping Unicode\n\
11628ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011629Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011630If there are two arguments, they must be strings of equal length, and\n\
11631in the resulting dictionary, each character in x will be mapped to the\n\
11632character at the same position in y. If there is a third argument, it\n\
11633must be a string, whose characters will be mapped to None in the result.");
11634
11635static PyObject*
11636unicode_maketrans(PyUnicodeObject *null, PyObject *args)
11637{
11638 PyObject *x, *y = NULL, *z = NULL;
11639 PyObject *new = NULL, *key, *value;
11640 Py_ssize_t i = 0;
11641 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011642
Georg Brandlceee0772007-11-27 23:48:05 +000011643 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
11644 return NULL;
11645 new = PyDict_New();
11646 if (!new)
11647 return NULL;
11648 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011649 int x_kind, y_kind, z_kind;
11650 void *x_data, *y_data, *z_data;
11651
Georg Brandlceee0772007-11-27 23:48:05 +000011652 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000011653 if (!PyUnicode_Check(x)) {
11654 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
11655 "be a string if there is a second argument");
11656 goto err;
11657 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011658 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011659 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
11660 "arguments must have equal length");
11661 goto err;
11662 }
11663 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011664 x_kind = PyUnicode_KIND(x);
11665 y_kind = PyUnicode_KIND(y);
11666 x_data = PyUnicode_DATA(x);
11667 y_data = PyUnicode_DATA(y);
11668 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
11669 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
11670 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011671 if (!key || !value)
11672 goto err;
11673 res = PyDict_SetItem(new, key, value);
11674 Py_DECREF(key);
11675 Py_DECREF(value);
11676 if (res < 0)
11677 goto err;
11678 }
11679 /* create entries for deleting chars in z */
11680 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011681 z_kind = PyUnicode_KIND(z);
11682 z_data = PyUnicode_DATA(z);
Georg Brandlceee0772007-11-27 23:48:05 +000011683 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011684 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011685 if (!key)
11686 goto err;
11687 res = PyDict_SetItem(new, key, Py_None);
11688 Py_DECREF(key);
11689 if (res < 0)
11690 goto err;
11691 }
11692 }
11693 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011694 int kind;
11695 void *data;
11696
Georg Brandlceee0772007-11-27 23:48:05 +000011697 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000011698 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011699 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
11700 "to maketrans it must be a dict");
11701 goto err;
11702 }
11703 /* copy entries into the new dict, converting string keys to int keys */
11704 while (PyDict_Next(x, &i, &key, &value)) {
11705 if (PyUnicode_Check(key)) {
11706 /* convert string keys to integer keys */
11707 PyObject *newkey;
11708 if (PyUnicode_GET_SIZE(key) != 1) {
11709 PyErr_SetString(PyExc_ValueError, "string keys in translate "
11710 "table must be of length 1");
11711 goto err;
11712 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011713 kind = PyUnicode_KIND(key);
11714 data = PyUnicode_DATA(key);
11715 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000011716 if (!newkey)
11717 goto err;
11718 res = PyDict_SetItem(new, newkey, value);
11719 Py_DECREF(newkey);
11720 if (res < 0)
11721 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000011722 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011723 /* just keep integer keys */
11724 if (PyDict_SetItem(new, key, value) < 0)
11725 goto err;
11726 } else {
11727 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
11728 "be strings or integers");
11729 goto err;
11730 }
11731 }
11732 }
11733 return new;
11734 err:
11735 Py_DECREF(new);
11736 return NULL;
11737}
11738
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011739PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011740 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011741\n\
11742Return a copy of the string S, where all characters have been mapped\n\
11743through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011744Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000011745Unmapped characters are left untouched. Characters mapped to None\n\
11746are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011747
11748static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011749unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011750{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011751 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011752}
11753
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011754PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011755 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011756\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011757Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011758
11759static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011760unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011761{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011762 return fixup(self, fixupper);
11763}
11764
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011765PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011766 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011767\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000011768Pad a numeric string S with zeros on the left, to fill a field\n\
11769of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011770
11771static PyObject *
11772unicode_zfill(PyUnicodeObject *self, PyObject *args)
11773{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011774 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011775 PyUnicodeObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011776 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011777 int kind;
11778 void *data;
11779 Py_UCS4 chr;
11780
11781 if (PyUnicode_READY(self) == -1)
11782 return NULL;
11783
Martin v. Löwis18e16552006-02-15 17:27:45 +000011784 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011785 return NULL;
11786
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011787 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000011788 if (PyUnicode_CheckExact(self)) {
11789 Py_INCREF(self);
11790 return (PyObject*) self;
11791 }
11792 else
Victor Stinner2219e0a2011-10-01 01:16:59 +020011793 return PyUnicode_Copy((PyObject*)self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011794 }
11795
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011796 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011797
11798 u = pad(self, fill, 0, '0');
11799
Walter Dörwald068325e2002-04-15 13:36:47 +000011800 if (u == NULL)
11801 return NULL;
11802
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011803 kind = PyUnicode_KIND(u);
11804 data = PyUnicode_DATA(u);
11805 chr = PyUnicode_READ(kind, data, fill);
11806
11807 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011808 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011809 PyUnicode_WRITE(kind, data, 0, chr);
11810 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000011811 }
11812
11813 return (PyObject*) u;
11814}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011815
#if 0
/* Debugging helper, compiled out: forwards to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted;

    converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
11823
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011824PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011825 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011826\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011827Return True if S starts with the specified prefix, False otherwise.\n\
11828With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011829With optional end, stop comparing S at that position.\n\
11830prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011831
11832static PyObject *
11833unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011834 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011835{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011836 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011837 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011838 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011839 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011840 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011841
Jesus Ceaac451502011-04-20 17:09:23 +020011842 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011843 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011844 if (PyTuple_Check(subobj)) {
11845 Py_ssize_t i;
11846 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11847 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011848 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011849 if (substring == NULL)
11850 return NULL;
11851 result = tailmatch(self, substring, start, end, -1);
11852 Py_DECREF(substring);
11853 if (result) {
11854 Py_RETURN_TRUE;
11855 }
11856 }
11857 /* nothing matched */
11858 Py_RETURN_FALSE;
11859 }
11860 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011861 if (substring == NULL) {
11862 if (PyErr_ExceptionMatches(PyExc_TypeError))
11863 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
11864 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011865 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011866 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011867 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011868 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011869 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011870}
11871
11872
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011873PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011874 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011875\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011876Return True if S ends with the specified suffix, False otherwise.\n\
11877With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011878With optional end, stop comparing S at that position.\n\
11879suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011880
11881static PyObject *
11882unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011883 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011884{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011885 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011886 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011887 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011888 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011889 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011890
Jesus Ceaac451502011-04-20 17:09:23 +020011891 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011892 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011893 if (PyTuple_Check(subobj)) {
11894 Py_ssize_t i;
11895 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11896 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011897 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011898 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011899 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011900 result = tailmatch(self, substring, start, end, +1);
11901 Py_DECREF(substring);
11902 if (result) {
11903 Py_RETURN_TRUE;
11904 }
11905 }
11906 Py_RETURN_FALSE;
11907 }
11908 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011909 if (substring == NULL) {
11910 if (PyErr_ExceptionMatches(PyExc_TypeError))
11911 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
11912 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011913 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011914 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011915 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011916 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011917 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011918}
11919
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011920#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000011921
11922PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011923 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000011924\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011925Return a formatted version of S, using substitutions from args and kwargs.\n\
11926The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000011927
Eric Smith27bbca62010-11-04 17:06:58 +000011928PyDoc_STRVAR(format_map__doc__,
11929 "S.format_map(mapping) -> str\n\
11930\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011931Return a formatted version of S, using substitutions from mapping.\n\
11932The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000011933
Eric Smith4a7d76d2008-05-30 18:10:19 +000011934static PyObject *
11935unicode__format__(PyObject* self, PyObject* args)
11936{
11937 PyObject *format_spec;
11938
11939 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
11940 return NULL;
11941
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011942 return _PyUnicode_FormatAdvanced(self, format_spec, 0,
11943 PyUnicode_GET_LENGTH(format_spec));
Eric Smith4a7d76d2008-05-30 18:10:19 +000011944}
11945
Eric Smith8c663262007-08-25 02:26:07 +000011946PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011947 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000011948\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011949Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000011950
11951static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011952unicode__sizeof__(PyUnicodeObject *v)
11953{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011954 Py_ssize_t size;
11955
11956 /* If it's a compact object, account for base structure +
11957 character data. */
11958 if (PyUnicode_IS_COMPACT_ASCII(v))
11959 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
11960 else if (PyUnicode_IS_COMPACT(v))
11961 size = sizeof(PyCompactUnicodeObject) +
11962 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v);
11963 else {
11964 /* If it is a two-block object, account for base object, and
11965 for character block if present. */
11966 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020011967 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011968 size += (PyUnicode_GET_LENGTH(v) + 1) *
11969 PyUnicode_CHARACTER_SIZE(v);
11970 }
11971 /* If the wstr pointer is present, account for it unless it is shared
11972 with the data pointer. Since PyUnicode_DATA will crash if the object
11973 is not ready, check whether it's either not ready (in which case the
11974 data is entirely in wstr) or if the data is not shared. */
11975 if (_PyUnicode_WSTR(v) &&
11976 (!PyUnicode_IS_READY(v) ||
11977 (PyUnicode_DATA(v) != _PyUnicode_WSTR(v))))
11978 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020011979 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020011980 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011981
11982 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011983}
11984
11985PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011986 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011987
11988static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020011989unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000011990{
Victor Stinner034f6cf2011-09-30 02:26:44 +020011991 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011992 if (!copy)
11993 return NULL;
11994 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000011995}
11996
Guido van Rossumd57fd912000-03-10 22:53:23 +000011997static PyMethodDef unicode_methods[] = {
11998
11999 /* Order is according to common usage: often used methods should
12000 appear first, since lookup is done sequentially. */
12001
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012002 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012003 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12004 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012005 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012006 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12007 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12008 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12009 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12010 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12011 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12012 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012013 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012014 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12015 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12016 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012017 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012018 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12019 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12020 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012021 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012022 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012023 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012024 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012025 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12026 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12027 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12028 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12029 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12030 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12031 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12032 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12033 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12034 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12035 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12036 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12037 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12038 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012039 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012040 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012041 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012042 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012043 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012044 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012045 {"maketrans", (PyCFunction) unicode_maketrans,
12046 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012047 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012048#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012049 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012050#endif
12051
12052#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012053 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012054 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012055#endif
12056
Benjamin Peterson14339b62009-01-31 16:36:08 +000012057 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012058 {NULL, NULL}
12059};
12060
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012061static PyObject *
12062unicode_mod(PyObject *v, PyObject *w)
12063{
Brian Curtindfc80e32011-08-10 20:28:54 -050012064 if (!PyUnicode_Check(v))
12065 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012066 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012067}
12068
12069static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012070 0, /*nb_add*/
12071 0, /*nb_subtract*/
12072 0, /*nb_multiply*/
12073 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012074};
12075
Guido van Rossumd57fd912000-03-10 22:53:23 +000012076static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012077 (lenfunc) unicode_length, /* sq_length */
12078 PyUnicode_Concat, /* sq_concat */
12079 (ssizeargfunc) unicode_repeat, /* sq_repeat */
12080 (ssizeargfunc) unicode_getitem, /* sq_item */
12081 0, /* sq_slice */
12082 0, /* sq_ass_item */
12083 0, /* sq_ass_slice */
12084 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012085};
12086
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012087static PyObject*
12088unicode_subscript(PyUnicodeObject* self, PyObject* item)
12089{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012090 if (PyUnicode_READY(self) == -1)
12091 return NULL;
12092
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000012093 if (PyIndex_Check(item)) {
12094 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012095 if (i == -1 && PyErr_Occurred())
12096 return NULL;
12097 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012098 i += PyUnicode_GET_LENGTH(self);
Victor Stinner2fe5ced2011-10-02 00:25:40 +020012099 return unicode_getitem((PyObject*)self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012100 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000012101 Py_ssize_t start, stop, step, slicelength, cur, i;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012102 const Py_UNICODE* source_buf;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012103 Py_UNICODE* result_buf;
12104 PyObject* result;
12105
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012106 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000012107 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012108 return NULL;
12109 }
12110
12111 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012112 return PyUnicode_New(0, 0);
12113 } else if (start == 0 && step == 1 &&
12114 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000012115 PyUnicode_CheckExact(self)) {
12116 Py_INCREF(self);
12117 return (PyObject *)self;
12118 } else if (step == 1) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012119 return PyUnicode_Substring((PyObject*)self,
12120 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012121 } else {
12122 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +000012123 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
12124 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012125
Benjamin Peterson29060642009-01-31 22:14:21 +000012126 if (result_buf == NULL)
12127 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012128
12129 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
12130 result_buf[i] = source_buf[cur];
12131 }
Tim Petersced69f82003-09-16 20:30:58 +000012132
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012133 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +000012134 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012135 return result;
12136 }
12137 } else {
12138 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
12139 return NULL;
12140 }
12141}
12142
12143static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012144 (lenfunc)unicode_length, /* mp_length */
12145 (binaryfunc)unicode_subscript, /* mp_subscript */
12146 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012147};
12148
Guido van Rossumd57fd912000-03-10 22:53:23 +000012149
Guido van Rossumd57fd912000-03-10 22:53:23 +000012150/* Helpers for PyUnicode_Format() */
12151
12152static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000012153getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012154{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012155 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012156 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012157 (*p_argidx)++;
12158 if (arglen < 0)
12159 return args;
12160 else
12161 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012162 }
12163 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012164 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012165 return NULL;
12166}
12167
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012168/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012169
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012170static PyObject *
12171formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012172{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012173 char *p;
12174 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012175 double x;
Tim Petersced69f82003-09-16 20:30:58 +000012176
Guido van Rossumd57fd912000-03-10 22:53:23 +000012177 x = PyFloat_AsDouble(v);
12178 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012179 return NULL;
12180
Guido van Rossumd57fd912000-03-10 22:53:23 +000012181 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012182 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000012183
Eric Smith0923d1d2009-04-16 20:16:10 +000012184 p = PyOS_double_to_string(x, type, prec,
12185 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012186 if (p == NULL)
12187 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012188 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000012189 PyMem_Free(p);
12190 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012191}
12192
Tim Peters38fd5b62000-09-21 05:43:11 +000012193static PyObject*
12194formatlong(PyObject *val, int flags, int prec, int type)
12195{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012196 char *buf;
12197 int len;
12198 PyObject *str; /* temporary string object. */
12199 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012200
Benjamin Peterson14339b62009-01-31 16:36:08 +000012201 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
12202 if (!str)
12203 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012204 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012205 Py_DECREF(str);
12206 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012207}
12208
Guido van Rossumd57fd912000-03-10 22:53:23 +000012209static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012210formatchar(Py_UCS4 *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012211 size_t buflen,
12212 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012213{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012214 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012215 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012216 if (PyUnicode_GET_LENGTH(v) == 1) {
12217 buf[0] = PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000012218 buf[1] = '\0';
12219 return 1;
12220 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012221 goto onError;
12222 }
12223 else {
12224 /* Integer input truncated to a character */
12225 long x;
12226 x = PyLong_AsLong(v);
12227 if (x == -1 && PyErr_Occurred())
12228 goto onError;
12229
12230 if (x < 0 || x > 0x10ffff) {
12231 PyErr_SetString(PyExc_OverflowError,
12232 "%c arg not in range(0x110000)");
12233 return -1;
12234 }
12235
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012236 buf[0] = (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012237 buf[1] = '\0';
12238 return 1;
12239 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012240
Benjamin Peterson29060642009-01-31 22:14:21 +000012241 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012242 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012243 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012244 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012245}
12246
/* fmt % (v1, v2, ...) is roughly equivalent to sprintf(fmt, v1, v2, ...).

   FORMATBUFLEN is the number of elements in the small scratch buffer in
   which single characters are formatted.
*/
#define FORMATBUFLEN ((size_t)10)
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012251
Alexander Belopolsky40018472011-02-26 01:02:56 +000012252PyObject *
12253PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012254{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012255 void *fmt;
12256 int fmtkind;
12257 PyObject *result;
12258 Py_UCS4 *res, *res0;
12259 Py_UCS4 max;
12260 int kind;
12261 Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012262 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012263 PyObject *dict = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012264 PyUnicodeObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +000012265
Guido van Rossumd57fd912000-03-10 22:53:23 +000012266 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012267 PyErr_BadInternalCall();
12268 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012269 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012270 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
12271 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012272 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012273 fmt = PyUnicode_DATA(uformat);
12274 fmtkind = PyUnicode_KIND(uformat);
12275 fmtcnt = PyUnicode_GET_LENGTH(uformat);
12276 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012277
12278 reslen = rescnt = fmtcnt + 100;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012279 res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4));
12280 if (res0 == NULL) {
12281 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012282 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012283 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012284
12285 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012286 arglen = PyTuple_Size(args);
12287 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012288 }
12289 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012290 arglen = -1;
12291 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012292 }
Christian Heimes90aa7642007-12-19 02:45:37 +000012293 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000012294 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000012295 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012296
12297 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012298 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012299 if (--rescnt < 0) {
12300 rescnt = fmtcnt + 100;
12301 reslen += rescnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012302 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12303 if (res0 == NULL){
12304 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012305 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012306 }
12307 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012308 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012309 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012310 *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012311 }
12312 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012313 /* Got a format specifier */
12314 int flags = 0;
12315 Py_ssize_t width = -1;
12316 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012317 Py_UCS4 c = '\0';
12318 Py_UCS4 fill;
Benjamin Peterson29060642009-01-31 22:14:21 +000012319 int isnumok;
12320 PyObject *v = NULL;
12321 PyObject *temp = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012322 void *pbuf;
12323 Py_ssize_t pindex;
Benjamin Peterson29060642009-01-31 22:14:21 +000012324 Py_UNICODE sign;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012325 Py_ssize_t len, len1;
12326 Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012327
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012328 fmtpos++;
12329 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
12330 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000012331 Py_ssize_t keylen;
12332 PyObject *key;
12333 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000012334
Benjamin Peterson29060642009-01-31 22:14:21 +000012335 if (dict == NULL) {
12336 PyErr_SetString(PyExc_TypeError,
12337 "format requires a mapping");
12338 goto onError;
12339 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012340 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012341 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012342 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012343 /* Skip over balanced parentheses */
12344 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012345 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000012346 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012347 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000012348 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012349 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000012350 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012351 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012352 if (fmtcnt < 0 || pcount > 0) {
12353 PyErr_SetString(PyExc_ValueError,
12354 "incomplete format key");
12355 goto onError;
12356 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020012357 key = PyUnicode_Substring((PyObject*)uformat,
12358 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000012359 if (key == NULL)
12360 goto onError;
12361 if (args_owned) {
12362 Py_DECREF(args);
12363 args_owned = 0;
12364 }
12365 args = PyObject_GetItem(dict, key);
12366 Py_DECREF(key);
12367 if (args == NULL) {
12368 goto onError;
12369 }
12370 args_owned = 1;
12371 arglen = -1;
12372 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012373 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012374 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012375 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012376 case '-': flags |= F_LJUST; continue;
12377 case '+': flags |= F_SIGN; continue;
12378 case ' ': flags |= F_BLANK; continue;
12379 case '#': flags |= F_ALT; continue;
12380 case '0': flags |= F_ZERO; continue;
12381 }
12382 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012383 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012384 if (c == '*') {
12385 v = getnextarg(args, arglen, &argidx);
12386 if (v == NULL)
12387 goto onError;
12388 if (!PyLong_Check(v)) {
12389 PyErr_SetString(PyExc_TypeError,
12390 "* wants int");
12391 goto onError;
12392 }
12393 width = PyLong_AsLong(v);
12394 if (width == -1 && PyErr_Occurred())
12395 goto onError;
12396 if (width < 0) {
12397 flags |= F_LJUST;
12398 width = -width;
12399 }
12400 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012401 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012402 }
12403 else if (c >= '0' && c <= '9') {
12404 width = c - '0';
12405 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012406 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012407 if (c < '0' || c > '9')
12408 break;
12409 if ((width*10) / 10 != width) {
12410 PyErr_SetString(PyExc_ValueError,
12411 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000012412 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000012413 }
12414 width = width*10 + (c - '0');
12415 }
12416 }
12417 if (c == '.') {
12418 prec = 0;
12419 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012420 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012421 if (c == '*') {
12422 v = getnextarg(args, arglen, &argidx);
12423 if (v == NULL)
12424 goto onError;
12425 if (!PyLong_Check(v)) {
12426 PyErr_SetString(PyExc_TypeError,
12427 "* wants int");
12428 goto onError;
12429 }
12430 prec = PyLong_AsLong(v);
12431 if (prec == -1 && PyErr_Occurred())
12432 goto onError;
12433 if (prec < 0)
12434 prec = 0;
12435 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012436 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012437 }
12438 else if (c >= '0' && c <= '9') {
12439 prec = c - '0';
12440 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012441 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012442 if (c < '0' || c > '9')
12443 break;
12444 if ((prec*10) / 10 != prec) {
12445 PyErr_SetString(PyExc_ValueError,
12446 "prec too big");
12447 goto onError;
12448 }
12449 prec = prec*10 + (c - '0');
12450 }
12451 }
12452 } /* prec */
12453 if (fmtcnt >= 0) {
12454 if (c == 'h' || c == 'l' || c == 'L') {
12455 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012456 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012457 }
12458 }
12459 if (fmtcnt < 0) {
12460 PyErr_SetString(PyExc_ValueError,
12461 "incomplete format");
12462 goto onError;
12463 }
12464 if (c != '%') {
12465 v = getnextarg(args, arglen, &argidx);
12466 if (v == NULL)
12467 goto onError;
12468 }
12469 sign = 0;
12470 fill = ' ';
12471 switch (c) {
12472
12473 case '%':
12474 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012475 kind = PyUnicode_4BYTE_KIND;
Benjamin Peterson29060642009-01-31 22:14:21 +000012476 /* presume that buffer length is at least 1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012477 PyUnicode_WRITE(kind, pbuf, 0, '%');
Benjamin Peterson29060642009-01-31 22:14:21 +000012478 len = 1;
12479 break;
12480
12481 case 's':
12482 case 'r':
12483 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000012484 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012485 temp = v;
12486 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012487 }
12488 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012489 if (c == 's')
12490 temp = PyObject_Str(v);
12491 else if (c == 'r')
12492 temp = PyObject_Repr(v);
12493 else
12494 temp = PyObject_ASCII(v);
12495 if (temp == NULL)
12496 goto onError;
12497 if (PyUnicode_Check(temp))
12498 /* nothing to do */;
12499 else {
12500 Py_DECREF(temp);
12501 PyErr_SetString(PyExc_TypeError,
12502 "%s argument has non-string str()");
12503 goto onError;
12504 }
12505 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012506 if (PyUnicode_READY(temp) == -1) {
12507 Py_CLEAR(temp);
12508 goto onError;
12509 }
12510 pbuf = PyUnicode_DATA(temp);
12511 kind = PyUnicode_KIND(temp);
12512 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012513 if (prec >= 0 && len > prec)
12514 len = prec;
12515 break;
12516
12517 case 'i':
12518 case 'd':
12519 case 'u':
12520 case 'o':
12521 case 'x':
12522 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000012523 isnumok = 0;
12524 if (PyNumber_Check(v)) {
12525 PyObject *iobj=NULL;
12526
12527 if (PyLong_Check(v)) {
12528 iobj = v;
12529 Py_INCREF(iobj);
12530 }
12531 else {
12532 iobj = PyNumber_Long(v);
12533 }
12534 if (iobj!=NULL) {
12535 if (PyLong_Check(iobj)) {
12536 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070012537 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000012538 Py_DECREF(iobj);
12539 if (!temp)
12540 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012541 if (PyUnicode_READY(temp) == -1) {
12542 Py_CLEAR(temp);
12543 goto onError;
12544 }
12545 pbuf = PyUnicode_DATA(temp);
12546 kind = PyUnicode_KIND(temp);
12547 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012548 sign = 1;
12549 }
12550 else {
12551 Py_DECREF(iobj);
12552 }
12553 }
12554 }
12555 if (!isnumok) {
12556 PyErr_Format(PyExc_TypeError,
12557 "%%%c format: a number is required, "
12558 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
12559 goto onError;
12560 }
12561 if (flags & F_ZERO)
12562 fill = '0';
12563 break;
12564
12565 case 'e':
12566 case 'E':
12567 case 'f':
12568 case 'F':
12569 case 'g':
12570 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012571 temp = formatfloat(v, flags, prec, c);
12572 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000012573 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012574 if (PyUnicode_READY(temp) == -1) {
12575 Py_CLEAR(temp);
12576 goto onError;
12577 }
12578 pbuf = PyUnicode_DATA(temp);
12579 kind = PyUnicode_KIND(temp);
12580 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012581 sign = 1;
12582 if (flags & F_ZERO)
12583 fill = '0';
12584 break;
12585
12586 case 'c':
12587 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012588 kind = PyUnicode_4BYTE_KIND;
Victor Stinnerb9dcffb2011-09-29 00:39:24 +020012589 len = formatchar(pbuf, Py_ARRAY_LENGTH(formatbuf), v);
Benjamin Peterson29060642009-01-31 22:14:21 +000012590 if (len < 0)
12591 goto onError;
12592 break;
12593
12594 default:
12595 PyErr_Format(PyExc_ValueError,
12596 "unsupported format character '%c' (0x%x) "
12597 "at index %zd",
12598 (31<=c && c<=126) ? (char)c : '?',
12599 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012600 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000012601 goto onError;
12602 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012603 /* pbuf is initialized here. */
12604 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000012605 if (sign) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012606 if (PyUnicode_READ(kind, pbuf, pindex) == '-' ||
12607 PyUnicode_READ(kind, pbuf, pindex) == '+') {
12608 sign = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012609 len--;
12610 }
12611 else if (flags & F_SIGN)
12612 sign = '+';
12613 else if (flags & F_BLANK)
12614 sign = ' ';
12615 else
12616 sign = 0;
12617 }
12618 if (width < len)
12619 width = len;
12620 if (rescnt - (sign != 0) < width) {
12621 reslen -= rescnt;
12622 rescnt = width + fmtcnt + 100;
12623 reslen += rescnt;
12624 if (reslen < 0) {
12625 Py_XDECREF(temp);
12626 PyErr_NoMemory();
12627 goto onError;
12628 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012629 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12630 if (res0 == 0) {
12631 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012632 Py_XDECREF(temp);
12633 goto onError;
12634 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012635 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012636 }
12637 if (sign) {
12638 if (fill != ' ')
12639 *res++ = sign;
12640 rescnt--;
12641 if (width > len)
12642 width--;
12643 }
12644 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012645 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12646 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000012647 if (fill != ' ') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012648 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12649 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012650 }
12651 rescnt -= 2;
12652 width -= 2;
12653 if (width < 0)
12654 width = 0;
12655 len -= 2;
12656 }
12657 if (width > len && !(flags & F_LJUST)) {
12658 do {
12659 --rescnt;
12660 *res++ = fill;
12661 } while (--width > len);
12662 }
12663 if (fill == ' ') {
12664 if (sign)
12665 *res++ = sign;
12666 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012667 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12668 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
12669 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12670 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012671 }
12672 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012673 /* Copy all characters, preserving len */
12674 len1 = len;
12675 while (len1--) {
12676 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12677 rescnt--;
12678 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012679 while (--width >= len) {
12680 --rescnt;
12681 *res++ = ' ';
12682 }
12683 if (dict && (argidx < arglen) && c != '%') {
12684 PyErr_SetString(PyExc_TypeError,
12685 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +000012686 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012687 goto onError;
12688 }
12689 Py_XDECREF(temp);
12690 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012691 } /* until end */
12692 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012693 PyErr_SetString(PyExc_TypeError,
12694 "not all arguments converted during string formatting");
12695 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012696 }
12697
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012698
12699 for (max=0, res = res0; res < res0+reslen-rescnt; res++)
12700 if (*res > max)
12701 max = *res;
12702 result = PyUnicode_New(reslen - rescnt, max);
12703 if (!result)
Benjamin Peterson29060642009-01-31 22:14:21 +000012704 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012705 kind = PyUnicode_KIND(result);
12706 for (res = res0; res < res0+reslen-rescnt; res++)
12707 PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res);
12708 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012709 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012710 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012711 }
12712 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012713 return (PyObject *)result;
12714
Benjamin Peterson29060642009-01-31 22:14:21 +000012715 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012716 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012717 Py_DECREF(uformat);
12718 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012719 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012720 }
12721 return NULL;
12722}
12723
Jeremy Hylton938ace62002-07-17 16:30:39 +000012724static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000012725unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
12726
Tim Peters6d6c1a32001-08-02 04:15:00 +000012727static PyObject *
12728unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12729{
Benjamin Peterson29060642009-01-31 22:14:21 +000012730 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012731 static char *kwlist[] = {"object", "encoding", "errors", 0};
12732 char *encoding = NULL;
12733 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000012734
Benjamin Peterson14339b62009-01-31 16:36:08 +000012735 if (type != &PyUnicode_Type)
12736 return unicode_subtype_new(type, args, kwds);
12737 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000012738 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012739 return NULL;
12740 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012741 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012742 if (encoding == NULL && errors == NULL)
12743 return PyObject_Str(x);
12744 else
Benjamin Peterson29060642009-01-31 22:14:21 +000012745 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000012746}
12747
Guido van Rossume023fe02001-08-30 03:12:59 +000012748static PyObject *
12749unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12750{
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012751 PyUnicodeObject *unicode, *self;
12752 Py_ssize_t length, char_size;
12753 int share_wstr, share_utf8;
12754 unsigned int kind;
12755 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000012756
Benjamin Peterson14339b62009-01-31 16:36:08 +000012757 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012758
12759 unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
12760 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012761 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020012762 assert(_PyUnicode_CHECK(unicode));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012763 if (PyUnicode_READY(unicode))
12764 return NULL;
12765
12766 self = (PyUnicodeObject *) type->tp_alloc(type, 0);
12767 if (self == NULL) {
12768 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012769 return NULL;
12770 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012771 kind = PyUnicode_KIND(unicode);
12772 length = PyUnicode_GET_LENGTH(unicode);
12773
12774 _PyUnicode_LENGTH(self) = length;
12775 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
12776 _PyUnicode_STATE(self).interned = 0;
12777 _PyUnicode_STATE(self).kind = kind;
12778 _PyUnicode_STATE(self).compact = 0;
12779 _PyUnicode_STATE(self).ascii = 0;
12780 _PyUnicode_STATE(self).ready = 1;
12781 _PyUnicode_WSTR(self) = NULL;
12782 _PyUnicode_UTF8_LENGTH(self) = 0;
12783 _PyUnicode_UTF8(self) = NULL;
12784 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020012785 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012786
12787 share_utf8 = 0;
12788 share_wstr = 0;
12789 if (kind == PyUnicode_1BYTE_KIND) {
12790 char_size = 1;
12791 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
12792 share_utf8 = 1;
12793 }
12794 else if (kind == PyUnicode_2BYTE_KIND) {
12795 char_size = 2;
12796 if (sizeof(wchar_t) == 2)
12797 share_wstr = 1;
12798 }
12799 else {
12800 assert(kind == PyUnicode_4BYTE_KIND);
12801 char_size = 4;
12802 if (sizeof(wchar_t) == 4)
12803 share_wstr = 1;
12804 }
12805
12806 /* Ensure we won't overflow the length. */
12807 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
12808 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012809 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012810 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012811 data = PyObject_MALLOC((length + 1) * char_size);
12812 if (data == NULL) {
12813 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012814 goto onError;
12815 }
12816
Victor Stinnerc3c74152011-10-02 20:39:55 +020012817 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012818 if (share_utf8) {
12819 _PyUnicode_UTF8_LENGTH(self) = length;
12820 _PyUnicode_UTF8(self) = data;
12821 }
12822 if (share_wstr) {
12823 _PyUnicode_WSTR_LENGTH(self) = length;
12824 _PyUnicode_WSTR(self) = (wchar_t *)data;
12825 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012826
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012827 Py_MEMCPY(data, PyUnicode_DATA(unicode),
12828 PyUnicode_KIND_SIZE(kind, length + 1));
12829 Py_DECREF(unicode);
12830 return (PyObject *)self;
12831
12832onError:
12833 Py_DECREF(unicode);
12834 Py_DECREF(self);
12835 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000012836}
12837
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012838PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000012839 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000012840\n\
Collin Winterd474ce82007-08-07 19:42:11 +000012841Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000012842encoding defaults to the current default string encoding.\n\
12843errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000012844
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012845static PyObject *unicode_iter(PyObject *seq);
12846
Guido van Rossumd57fd912000-03-10 22:53:23 +000012847PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000012848 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012849 "str", /* tp_name */
12850 sizeof(PyUnicodeObject), /* tp_size */
12851 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012852 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012853 (destructor)unicode_dealloc, /* tp_dealloc */
12854 0, /* tp_print */
12855 0, /* tp_getattr */
12856 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000012857 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012858 unicode_repr, /* tp_repr */
12859 &unicode_as_number, /* tp_as_number */
12860 &unicode_as_sequence, /* tp_as_sequence */
12861 &unicode_as_mapping, /* tp_as_mapping */
12862 (hashfunc) unicode_hash, /* tp_hash*/
12863 0, /* tp_call*/
12864 (reprfunc) unicode_str, /* tp_str */
12865 PyObject_GenericGetAttr, /* tp_getattro */
12866 0, /* tp_setattro */
12867 0, /* tp_as_buffer */
12868 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000012869 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012870 unicode_doc, /* tp_doc */
12871 0, /* tp_traverse */
12872 0, /* tp_clear */
12873 PyUnicode_RichCompare, /* tp_richcompare */
12874 0, /* tp_weaklistoffset */
12875 unicode_iter, /* tp_iter */
12876 0, /* tp_iternext */
12877 unicode_methods, /* tp_methods */
12878 0, /* tp_members */
12879 0, /* tp_getset */
12880 &PyBaseObject_Type, /* tp_base */
12881 0, /* tp_dict */
12882 0, /* tp_descr_get */
12883 0, /* tp_descr_set */
12884 0, /* tp_dictoffset */
12885 0, /* tp_init */
12886 0, /* tp_alloc */
12887 unicode_new, /* tp_new */
12888 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012889};
12890
12891/* Initialize the Unicode implementation */
12892
Thomas Wouters78890102000-07-22 19:25:51 +000012893void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012894{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012895 int i;
12896
Thomas Wouters477c8d52006-05-27 19:21:47 +000012897 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012898 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012899 0x000A, /* LINE FEED */
12900 0x000D, /* CARRIAGE RETURN */
12901 0x001C, /* FILE SEPARATOR */
12902 0x001D, /* GROUP SEPARATOR */
12903 0x001E, /* RECORD SEPARATOR */
12904 0x0085, /* NEXT LINE */
12905 0x2028, /* LINE SEPARATOR */
12906 0x2029, /* PARAGRAPH SEPARATOR */
12907 };
12908
Fred Drakee4315f52000-05-09 19:53:39 +000012909 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020012910 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012911 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012912 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012913
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012914 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000012915 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000012916 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012917 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012918
12919 /* initialize the linebreak bloom filter */
12920 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012921 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020012922 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012923
12924 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012925}
12926
12927/* Finalize the Unicode implementation */
12928
/* Nothing is released here: the function does no work and always
   reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
12934
Guido van Rossumd57fd912000-03-10 22:53:23 +000012935void
Thomas Wouters78890102000-07-22 19:25:51 +000012936_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012937{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012938 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012939
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000012940 Py_XDECREF(unicode_empty);
12941 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000012942
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012943 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012944 if (unicode_latin1[i]) {
12945 Py_DECREF(unicode_latin1[i]);
12946 unicode_latin1[i] = NULL;
12947 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012948 }
Christian Heimesa156e092008-02-16 07:38:31 +000012949 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012950}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000012951
Walter Dörwald16807132007-05-25 13:52:07 +000012952void
12953PyUnicode_InternInPlace(PyObject **p)
12954{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012955 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
12956 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020012957#ifdef Py_DEBUG
12958 assert(s != NULL);
12959 assert(_PyUnicode_CHECK(s));
12960#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000012961 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020012962 return;
12963#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000012964 /* If it's a subclass, we don't really know what putting
12965 it in the interned dict might do. */
12966 if (!PyUnicode_CheckExact(s))
12967 return;
12968 if (PyUnicode_CHECK_INTERNED(s))
12969 return;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012970 if (PyUnicode_READY(s) == -1) {
Victor Stinner4fae54c2011-10-03 02:01:52 +020012971 assert(0 && "PyUnicode_READY fail in PyUnicode_InternInPlace");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012972 return;
12973 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012974 if (interned == NULL) {
12975 interned = PyDict_New();
12976 if (interned == NULL) {
12977 PyErr_Clear(); /* Don't leave an exception */
12978 return;
12979 }
12980 }
12981 /* It might be that the GetItem call fails even
12982 though the key is present in the dictionary,
12983 namely when this happens during a stack overflow. */
12984 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000012985 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012986 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000012987
Benjamin Peterson29060642009-01-31 22:14:21 +000012988 if (t) {
12989 Py_INCREF(t);
12990 Py_DECREF(*p);
12991 *p = t;
12992 return;
12993 }
Walter Dörwald16807132007-05-25 13:52:07 +000012994
Benjamin Peterson14339b62009-01-31 16:36:08 +000012995 PyThreadState_GET()->recursion_critical = 1;
12996 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
12997 PyErr_Clear();
12998 PyThreadState_GET()->recursion_critical = 0;
12999 return;
13000 }
13001 PyThreadState_GET()->recursion_critical = 0;
13002 /* The two references in interned are not counted by refcnt.
13003 The deallocator will take care of this */
13004 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013005 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000013006}
13007
13008void
13009PyUnicode_InternImmortal(PyObject **p)
13010{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013011 PyUnicodeObject *u = (PyUnicodeObject *)*p;
13012
Benjamin Peterson14339b62009-01-31 16:36:08 +000013013 PyUnicode_InternInPlace(p);
13014 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013015 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013016 Py_INCREF(*p);
13017 }
Walter Dörwald16807132007-05-25 13:52:07 +000013018}
13019
13020PyObject *
13021PyUnicode_InternFromString(const char *cp)
13022{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013023 PyObject *s = PyUnicode_FromString(cp);
13024 if (s == NULL)
13025 return NULL;
13026 PyUnicode_InternInPlace(&s);
13027 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000013028}
13029
Alexander Belopolsky40018472011-02-26 01:02:56 +000013030void
13031_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000013032{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013033 PyObject *keys;
13034 PyUnicodeObject *s;
13035 Py_ssize_t i, n;
13036 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000013037
Benjamin Peterson14339b62009-01-31 16:36:08 +000013038 if (interned == NULL || !PyDict_Check(interned))
13039 return;
13040 keys = PyDict_Keys(interned);
13041 if (keys == NULL || !PyList_Check(keys)) {
13042 PyErr_Clear();
13043 return;
13044 }
Walter Dörwald16807132007-05-25 13:52:07 +000013045
Benjamin Peterson14339b62009-01-31 16:36:08 +000013046 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
13047 detector, interned unicode strings are not forcibly deallocated;
13048 rather, we give them their stolen references back, and then clear
13049 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000013050
Benjamin Peterson14339b62009-01-31 16:36:08 +000013051 n = PyList_GET_SIZE(keys);
13052 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000013053 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013054 for (i = 0; i < n; i++) {
13055 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013056 if (PyUnicode_READY(s) == -1)
13057 fprintf(stderr, "could not ready string\n");
13058 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013059 case SSTATE_NOT_INTERNED:
13060 /* XXX Shouldn't happen */
13061 break;
13062 case SSTATE_INTERNED_IMMORTAL:
13063 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013064 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013065 break;
13066 case SSTATE_INTERNED_MORTAL:
13067 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013068 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013069 break;
13070 default:
13071 Py_FatalError("Inconsistent interned string state.");
13072 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013073 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013074 }
13075 fprintf(stderr, "total size of all interned strings: "
13076 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
13077 "mortal/immortal\n", mortal_size, immortal_size);
13078 Py_DECREF(keys);
13079 PyDict_Clear(interned);
13080 Py_DECREF(interned);
13081 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000013082}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013083
13084
13085/********************* Unicode Iterator **************************/
13086
13087typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013088 PyObject_HEAD
13089 Py_ssize_t it_index;
13090 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013091} unicodeiterobject;
13092
13093static void
13094unicodeiter_dealloc(unicodeiterobject *it)
13095{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013096 _PyObject_GC_UNTRACK(it);
13097 Py_XDECREF(it->it_seq);
13098 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013099}
13100
13101static int
13102unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
13103{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013104 Py_VISIT(it->it_seq);
13105 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013106}
13107
13108static PyObject *
13109unicodeiter_next(unicodeiterobject *it)
13110{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013111 PyUnicodeObject *seq;
13112 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013113
Benjamin Peterson14339b62009-01-31 16:36:08 +000013114 assert(it != NULL);
13115 seq = it->it_seq;
13116 if (seq == NULL)
13117 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013118 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013119
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013120 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
13121 int kind = PyUnicode_KIND(seq);
13122 void *data = PyUnicode_DATA(seq);
13123 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
13124 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013125 if (item != NULL)
13126 ++it->it_index;
13127 return item;
13128 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013129
Benjamin Peterson14339b62009-01-31 16:36:08 +000013130 Py_DECREF(seq);
13131 it->it_seq = NULL;
13132 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013133}
13134
13135static PyObject *
13136unicodeiter_len(unicodeiterobject *it)
13137{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013138 Py_ssize_t len = 0;
13139 if (it->it_seq)
13140 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
13141 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013142}
13143
13144PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
13145
13146static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013147 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000013148 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000013149 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013150};
13151
13152PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013153 PyVarObject_HEAD_INIT(&PyType_Type, 0)
13154 "str_iterator", /* tp_name */
13155 sizeof(unicodeiterobject), /* tp_basicsize */
13156 0, /* tp_itemsize */
13157 /* methods */
13158 (destructor)unicodeiter_dealloc, /* tp_dealloc */
13159 0, /* tp_print */
13160 0, /* tp_getattr */
13161 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013162 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013163 0, /* tp_repr */
13164 0, /* tp_as_number */
13165 0, /* tp_as_sequence */
13166 0, /* tp_as_mapping */
13167 0, /* tp_hash */
13168 0, /* tp_call */
13169 0, /* tp_str */
13170 PyObject_GenericGetAttr, /* tp_getattro */
13171 0, /* tp_setattro */
13172 0, /* tp_as_buffer */
13173 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
13174 0, /* tp_doc */
13175 (traverseproc)unicodeiter_traverse, /* tp_traverse */
13176 0, /* tp_clear */
13177 0, /* tp_richcompare */
13178 0, /* tp_weaklistoffset */
13179 PyObject_SelfIter, /* tp_iter */
13180 (iternextfunc)unicodeiter_next, /* tp_iternext */
13181 unicodeiter_methods, /* tp_methods */
13182 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013183};
13184
13185static PyObject *
13186unicode_iter(PyObject *seq)
13187{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013188 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013189
Benjamin Peterson14339b62009-01-31 16:36:08 +000013190 if (!PyUnicode_Check(seq)) {
13191 PyErr_BadInternalCall();
13192 return NULL;
13193 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013194 if (PyUnicode_READY(seq) == -1)
13195 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013196 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
13197 if (it == NULL)
13198 return NULL;
13199 it->it_index = 0;
13200 Py_INCREF(seq);
13201 it->it_seq = (PyUnicodeObject *)seq;
13202 _PyObject_GC_TRACK(it);
13203 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013204}
13205
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013206#define UNIOP(x) Py_UNICODE_##x
13207#define UNIOP_t Py_UNICODE
13208#include "uniops.h"
13209#undef UNIOP
13210#undef UNIOP_t
13211#define UNIOP(x) Py_UCS4_##x
13212#define UNIOP_t Py_UCS4
13213#include "uniops.h"
13214#undef UNIOP
13215#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000013216
Victor Stinner71133ff2010-09-01 23:43:53 +000013217Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000013218PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000013219{
13220 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
13221 Py_UNICODE *copy;
13222 Py_ssize_t size;
13223
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013224 if (!PyUnicode_Check(unicode)) {
13225 PyErr_BadArgument();
13226 return NULL;
13227 }
Victor Stinner71133ff2010-09-01 23:43:53 +000013228 /* Ensure we won't overflow the size. */
13229 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
13230 PyErr_NoMemory();
13231 return NULL;
13232 }
13233 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
13234 size *= sizeof(Py_UNICODE);
13235 copy = PyMem_Malloc(size);
13236 if (copy == NULL) {
13237 PyErr_NoMemory();
13238 return NULL;
13239 }
13240 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
13241 return copy;
13242}
Martin v. Löwis5b222132007-06-10 09:51:05 +000013243
Georg Brandl66c221e2010-10-14 07:04:07 +000013244/* A _string module, to export formatter_parser and formatter_field_name_split
13245 to the string.Formatter class implemented in Python. */
13246
13247static PyMethodDef _string_methods[] = {
13248 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
13249 METH_O, PyDoc_STR("split the argument as a field name")},
13250 {"formatter_parser", (PyCFunction) formatter_parser,
13251 METH_O, PyDoc_STR("parse the argument as a format string")},
13252 {NULL, NULL}
13253};
13254
13255static struct PyModuleDef _string_module = {
13256 PyModuleDef_HEAD_INIT,
13257 "_string",
13258 PyDoc_STR("string helper module"),
13259 0,
13260 _string_methods,
13261 NULL,
13262 NULL,
13263 NULL,
13264 NULL
13265};
13266
13267PyMODINIT_FUNC
13268PyInit__string(void)
13269{
13270 return PyModule_Create(&_string_module);
13271}
13272
13273
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013274#ifdef __cplusplus
13275}
13276#endif