blob: cc6b41697d5c6025bc5b781c009729d9c7d738b6 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
/* Maximum number of Unicode objects kept on the free list. */
#define PyUnicode_MAXFREELIST 1024

/* Size threshold for the free list "stay alive" optimization.

   Unicode objects on the free list whose size is below this limit keep
   their allocated memory intact, which reduces malloc() overhead for
   small Unicode objects.

   In the worst case this leaves PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT + malloc()-overhead)
   bytes of unused garbage.

   A limit of 0 effectively turns the feature off.

   Note: this is an experimental feature!  If you get core dumps when
   using Unicode objects, turn it off. */
#define KEEPALIVE_SIZE_LIMIT 9

/* Endianness switches; little endian is the default. */
#ifndef WORDS_BIGENDIAN
# define BYTEORDER_IS_LITTLE_ENDIAN
#else
# define BYTEORDER_IS_BIG_ENDIAN
#endif
79
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000080/* --- Globals ------------------------------------------------------------
81
82 The globals are initialized by the _PyUnicode_Init() API and should
83 not be used before calling that API.
84
85*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000086
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000087
88#ifdef __cplusplus
89extern "C" {
90#endif
91
/* Type check used by the macros below: debug builds run the full
   consistency checker, all other builds only test the object type. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020097
/* UTF-8 representation.  _PyUnicode_UTF8() is the raw struct field;
   PyUnicode_UTF8() is the checked variant, which for compact ASCII
   strings yields the memory directly following the PyASCIIObject
   header instead of the field. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))

/* Length of the UTF-8 representation; for compact ASCII strings the
   checked variant returns the string length itself. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))

/* Unchecked access to individual struct fields. */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)

/* Kind and length, with a type/consistency assertion first. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* Unchecked access to the data pointer of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200126
/* Local override of PyUnicode_READY(): evaluates to 0 when the object
   is already ready, otherwise to the result of _PyUnicode_Ready().
   The object is checked with _PyUnicode_CHECK() first. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 : _PyUnicode_Ready((PyObject *)(op))))

/* Same as above, but takes a pointer to the object pointer and hands it
   to _PyUnicode_ReadyReplace() when the object is not ready yet.
   p_obj is parenthesized before being dereferenced so that argument
   expressions such as "objs + 1" expand correctly. */
#define _PyUnicode_READY_REPLACE(p_obj)                 \
    (assert(_PyUnicode_CHECK(*(p_obj))),                \
     (PyUnicode_IS_READY(*(p_obj)) ?                    \
      0 : _PyUnicode_ReadyReplace((PyObject **)(p_obj))))
137
/* true if the UTF-8 pointer of the object is the same memory as its
   character data; must not be used on compact ASCII strings */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* true if the wstr pointer of the object is the same memory as its
   character data.  The macro refers to its own argument only; it used
   to read a variable literally named "unicode" from the caller's
   scope. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
145
/* Non-zero when the Unicode object owns a separately allocated UTF-8
   buffer: the object is not compact ASCII, its utf8 pointer is set, and
   that pointer is not simply the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                          \
    (assert(_PyUnicode_CHECK(op)),                              \
     (!PyUnicode_IS_COMPACT_ASCII(op)                           \
      && _PyUnicode_UTF8(op) != NULL                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
153
/* Generic element-wise conversion between character types.

   from_type and to_type must be valid type names.  begin and end are
   pointers of type "from_type *" delimiting the source characters
   (end is exclusive).  to points to the destination buffer and is
   treated as "to_type *"; one converted character is stored there for
   every source character. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        const from_type *src_ = (begin);                \
        to_type *dst_ = (to_type *)(to);                \
        while (src_ < (end)) {                          \
            *dst_ = (to_type)*src_;                     \
            ++src_;                                     \
            ++dst_;                                     \
        }                                               \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200168
Victor Stinnerb15d4d82011-09-28 23:59:20 +0200169/* The Unicode string has been modified: reset the hash */
170#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)
171
Walter Dörwald16807132007-05-25 13:52:07 +0000172/* This dictionary holds all interned unicode strings. Note that references
173 to strings in this dictionary are *not* counted in the string's ob_refcnt.
174 When the interned string reaches a refcnt of 0 the string deallocation
175 function will delete the reference from this dictionary.
176
177 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000178 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000179*/
180static PyObject *interned;
181
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000182/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200183static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184
185/* Single character Unicode strings in the Latin-1 range are being
186 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200187static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000188
/* Fast detection of the most frequent whitespace characters: one entry
   per ASCII code point, 1 for whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F
       0x0009 CHARACTER TABULATION, 0x000A LINE FEED,
       0x000B LINE TABULATION, 0x000C FORM FEED,
       0x000D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F
       0x001C FILE SEPARATOR, 0x001D GROUP SEPARATOR,
       0x001E RECORD SEPARATOR, 0x001F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F
       0x0020 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
219
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200220/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200221static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200222static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200223
Alexander Belopolsky40018472011-02-26 01:02:56 +0000224static PyObject *
225unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000226 PyObject **errorHandler,const char *encoding, const char *reason,
227 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
228 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
229
Alexander Belopolsky40018472011-02-26 01:02:56 +0000230static void
231raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300232 const char *encoding,
233 const Py_UNICODE *unicode, Py_ssize_t size,
234 Py_ssize_t startpos, Py_ssize_t endpos,
235 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000236
/* Same idea for line breaks: one entry per ASCII code point, 1 for a
   line break character and 0 otherwise. */
static unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x0F
       0x000A LINE FEED, 0x000B LINE TABULATION,
       0x000C FORM FEED, 0x000D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F
       0x001C FILE SEPARATOR, 0x001D GROUP SEPARATOR,
       0x001E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
264
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300265/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
266 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000267Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000268PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000269{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000270#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000271 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000272#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000273 /* This is actually an illegal character, so it should
274 not be passed to unichr. */
275 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000276#endif
277}
278
#ifdef Py_DEBUG
/* Debug-build sanity check of the state flags of a Unicode object.

   Asserts that kind, compact, ascii and ready (plus, for non-compact
   objects, the wstr/data/utf8 pointers) form one of the accepted
   combinations.  Always returns 1 so that it can be used inside
   assert(). */
static int
_PyUnicode_CheckConsistency(void *op)
{
    PyASCIIObject *ascii;
    PyCompactUnicodeObject *compact;
    PyUnicodeObject *unicode;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    compact = (PyCompactUnicodeObject *)op;
    unicode = (PyUnicodeObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.compact == 1) {
        if (ascii->state.ascii == 1) {
            /* compact ASCII */
            assert(kind == PyUnicode_1BYTE_KIND);
        }
        else {
            /* compact, not ASCII */
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
        }
        assert(ascii->state.ready == 1);
        return 1;
    }

    if (kind != PyUnicode_WCHAR_KIND) {
        /* non-compact string that has been made ready */
        assert(kind == PyUnicode_1BYTE_KIND
               || kind == PyUnicode_2BYTE_KIND
               || kind == PyUnicode_4BYTE_KIND);
        assert(ascii->state.compact == 0);
        assert(ascii->state.ready == 1);
        assert(unicode->data.any != NULL);
    }
    else {
        /* non-compact string that only has a wstr buffer so far */
        assert(ascii->state.compact == 0);
        assert(ascii->state.ascii == 0);
        assert(ascii->state.ready == 0);
        assert(ascii->wstr != NULL);
        assert(unicode->data.any == NULL);
        assert(compact->utf8 == NULL);
        assert(ascii->state.interned == SSTATE_NOT_INTERNED);
    }
    return 1;
}
#endif
326
Thomas Wouters477c8d52006-05-27 19:21:47 +0000327/* --- Bloom Filters ----------------------------------------------------- */
328
329/* stuff to implement simple "bloom filters" for Unicode characters.
330 to keep things simple, we use a single bitmask, using the least 5
331 bits from each unicode characters as the bit index. */
332
333/* the linebreak mask is set up by Unicode_Init below */
334
Antoine Pitrouf068f942010-01-13 14:19:12 +0000335#if LONG_BIT >= 128
336#define BLOOM_WIDTH 128
337#elif LONG_BIT >= 64
338#define BLOOM_WIDTH 64
339#elif LONG_BIT >= 32
340#define BLOOM_WIDTH 32
341#else
342#error "LONG_BIT is smaller than 32"
343#endif
344
Thomas Wouters477c8d52006-05-27 19:21:47 +0000345#define BLOOM_MASK unsigned long
346
347static BLOOM_MASK bloom_linebreak;
348
Antoine Pitrouf068f942010-01-13 14:19:12 +0000349#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
350#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000351
Benjamin Peterson29060642009-01-31 22:14:21 +0000352#define BLOOM_LINEBREAK(ch) \
353 ((ch) < 128U ? ascii_linebreak[(ch)] : \
354 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000355
Alexander Belopolsky40018472011-02-26 01:02:56 +0000356Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200357make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000358{
359 /* calculate simple bloom-style bitmask for a given unicode string */
360
Antoine Pitrouf068f942010-01-13 14:19:12 +0000361 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000362 Py_ssize_t i;
363
364 mask = 0;
365 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200366 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000367
368 return mask;
369}
370
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200371#define BLOOM_MEMBER(mask, chr, str) \
372 (BLOOM(mask, chr) \
373 && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000374
Guido van Rossumd57fd912000-03-10 22:53:23 +0000375/* --- Unicode Object ----------------------------------------------------- */
376
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200377static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200378fixup(PyUnicodeObject *self, Py_UCS4 (*fixfct)(PyUnicodeObject *s));
379
380Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
381 Py_ssize_t size, Py_UCS4 ch,
382 int direction)
383{
384 /* like wcschr, but doesn't stop at NULL characters */
385 Py_ssize_t i;
386 if (direction == 1) {
387 for(i = 0; i < size; i++)
388 if (PyUnicode_READ(kind, s, i) == ch)
389 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
390 }
391 else {
392 for(i = size-1; i >= 0; i--)
393 if (PyUnicode_READ(kind, s, i) == ch)
394 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
395 }
396 return NULL;
397}
398
Victor Stinnerfe226c02011-10-03 03:52:20 +0200399static PyObject*
400resize_compact(PyObject *unicode, Py_ssize_t length)
401{
402 Py_ssize_t char_size;
403 Py_ssize_t struct_size;
404 Py_ssize_t new_size;
405 int share_wstr;
406
407 assert(PyUnicode_IS_READY(unicode));
408 char_size = PyUnicode_CHARACTER_SIZE(unicode);
409 if (PyUnicode_IS_COMPACT_ASCII(unicode))
410 struct_size = sizeof(PyASCIIObject);
411 else
412 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200413 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200414
415 _Py_DEC_REFTOTAL;
416 _Py_ForgetReference(unicode);
417
418 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
419 PyErr_NoMemory();
420 return NULL;
421 }
422 new_size = (struct_size + (length + 1) * char_size);
423
424 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
425 if (unicode == NULL) {
426 PyObject_Del(unicode);
427 PyErr_NoMemory();
428 return NULL;
429 }
430 _Py_NewReference(unicode);
431 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200432 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200433 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200434 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
435 _PyUnicode_WSTR_LENGTH(unicode) = length;
436 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200437 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
438 length, 0);
439 return unicode;
440}
441
Alexander Belopolsky40018472011-02-26 01:02:56 +0000442static int
Victor Stinnerfe226c02011-10-03 03:52:20 +0200443resize_inplace(register PyUnicodeObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000444{
445 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000446
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200447 assert(!PyUnicode_IS_COMPACT(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200448
Victor Stinnerfe226c02011-10-03 03:52:20 +0200449 assert(Py_REFCNT(unicode) == 1);
Victor Stinnerb15d4d82011-09-28 23:59:20 +0200450 _PyUnicode_DIRTY(unicode);
Tim Petersced69f82003-09-16 20:30:58 +0000451
Victor Stinnerfe226c02011-10-03 03:52:20 +0200452 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
453 {
454 PyObject_DEL(_PyUnicode_UTF8(unicode));
455 _PyUnicode_UTF8(unicode) = NULL;
456 }
457
458 if (PyUnicode_IS_READY(unicode)) {
459 Py_ssize_t char_size;
460 Py_ssize_t new_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200461 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200462 void *data;
463
464 data = _PyUnicode_DATA_ANY(unicode);
465 assert(data != NULL);
466 char_size = PyUnicode_CHARACTER_SIZE(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200467 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
468 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200469
470 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
471 PyErr_NoMemory();
472 return -1;
473 }
474 new_size = (length + 1) * char_size;
475
476 data = (PyObject *)PyObject_REALLOC(data, new_size);
477 if (data == NULL) {
478 PyErr_NoMemory();
479 return -1;
480 }
481 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200482 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200483 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200484 _PyUnicode_WSTR_LENGTH(unicode) = length;
485 }
486 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200487 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200488 _PyUnicode_UTF8_LENGTH(unicode) = length;
489 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200490 _PyUnicode_LENGTH(unicode) = length;
491 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
492 if (share_wstr)
493 return 0;
494 }
495 if (_PyUnicode_WSTR(unicode) != NULL) {
496 assert(_PyUnicode_WSTR(unicode) != NULL);
497
498 oldstr = _PyUnicode_WSTR(unicode);
499 _PyUnicode_WSTR(unicode) = PyObject_REALLOC(_PyUnicode_WSTR(unicode),
500 sizeof(Py_UNICODE) * (length + 1));
501 if (!_PyUnicode_WSTR(unicode)) {
502 _PyUnicode_WSTR(unicode) = (Py_UNICODE *)oldstr;
503 PyErr_NoMemory();
504 return -1;
505 }
506 _PyUnicode_WSTR(unicode)[length] = 0;
507 _PyUnicode_WSTR_LENGTH(unicode) = length;
508 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000509 return 0;
510}
511
Victor Stinnerfe226c02011-10-03 03:52:20 +0200512static PyObject*
513resize_copy(PyObject *unicode, Py_ssize_t length)
514{
515 Py_ssize_t copy_length;
516 if (PyUnicode_IS_COMPACT(unicode)) {
517 PyObject *copy;
518 assert(PyUnicode_IS_READY(unicode));
519
520 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
521 if (copy == NULL)
522 return NULL;
523
524 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
525 if (PyUnicode_CopyCharacters(copy, 0,
526 unicode, 0,
527 copy_length) < 0)
528 {
529 Py_DECREF(copy);
530 return NULL;
531 }
532 return copy;
533 } else {
Victor Stinner2fd82272011-10-03 04:06:05 +0200534 PyUnicodeObject *w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200535 assert(_PyUnicode_WSTR(unicode) != NULL);
536 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinner2fd82272011-10-03 04:06:05 +0200537 w = _PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200538 if (w == NULL)
539 return NULL;
540 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
541 copy_length = Py_MIN(copy_length, length);
542 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
543 copy_length);
544 return (PyObject*)w;
545 }
546}
547
Guido van Rossumd57fd912000-03-10 22:53:23 +0000548/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000549 Ux0000 terminated; some code (e.g. new_identifier)
550 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000551
552 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000553 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000554
555*/
556
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200557#ifdef Py_DEBUG
558int unicode_old_new_calls = 0;
559#endif
560
Alexander Belopolsky40018472011-02-26 01:02:56 +0000561static PyUnicodeObject *
562_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000563{
564 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200565 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000566
Thomas Wouters477c8d52006-05-27 19:21:47 +0000567 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000568 if (length == 0 && unicode_empty != NULL) {
569 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200570 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000571 }
572
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000573 /* Ensure we won't overflow the size. */
574 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
575 return (PyUnicodeObject *)PyErr_NoMemory();
576 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200577 if (length < 0) {
578 PyErr_SetString(PyExc_SystemError,
579 "Negative size passed to _PyUnicode_New");
580 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000581 }
582
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200583#ifdef Py_DEBUG
584 ++unicode_old_new_calls;
585#endif
586
587 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
588 if (unicode == NULL)
589 return NULL;
590 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
591 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
592 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000593 PyErr_NoMemory();
594 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000595 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200596
Jeremy Hyltond8082792003-09-16 19:41:39 +0000597 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000598 * the caller fails before initializing str -- unicode_resize()
599 * reads str[0], and the Keep-Alive optimization can keep memory
600 * allocated for str alive across a call to unicode_dealloc(unicode).
601 * We don't want unicode_resize to read uninitialized memory in
602 * that case.
603 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200604 _PyUnicode_WSTR(unicode)[0] = 0;
605 _PyUnicode_WSTR(unicode)[length] = 0;
606 _PyUnicode_WSTR_LENGTH(unicode) = length;
607 _PyUnicode_HASH(unicode) = -1;
608 _PyUnicode_STATE(unicode).interned = 0;
609 _PyUnicode_STATE(unicode).kind = 0;
610 _PyUnicode_STATE(unicode).compact = 0;
611 _PyUnicode_STATE(unicode).ready = 0;
612 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200613 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200614 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200615 _PyUnicode_UTF8(unicode) = NULL;
616 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000617 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000618
Benjamin Peterson29060642009-01-31 22:14:21 +0000619 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000620 /* XXX UNREF/NEWREF interface should be more symmetrical */
621 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000622 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000623 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000624 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000625}
626
Victor Stinnerf42dc442011-10-02 23:33:16 +0200627static const char*
628unicode_kind_name(PyObject *unicode)
629{
Victor Stinner910337b2011-10-03 03:20:16 +0200630 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerf42dc442011-10-02 23:33:16 +0200631 if (!PyUnicode_IS_COMPACT(unicode))
632 {
633 if (!PyUnicode_IS_READY(unicode))
634 return "wstr";
635 switch(PyUnicode_KIND(unicode))
636 {
637 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200638 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200639 return "legacy ascii";
640 else
641 return "legacy latin1";
642 case PyUnicode_2BYTE_KIND:
643 return "legacy UCS2";
644 case PyUnicode_4BYTE_KIND:
645 return "legacy UCS4";
646 default:
647 return "<legacy invalid kind>";
648 }
649 }
650 assert(PyUnicode_IS_READY(unicode));
651 switch(PyUnicode_KIND(unicode))
652 {
653 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200654 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200655 return "ascii";
656 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200657 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200658 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200659 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200660 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200661 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200662 default:
663 return "<invalid compact kind>";
664 }
665}
666
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200667#ifdef Py_DEBUG
668int unicode_new_new_calls = 0;
669
670/* Functions wrapping macros for use in debugger */
671char *_PyUnicode_utf8(void *unicode){
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200672 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200673}
674
675void *_PyUnicode_compact_data(void *unicode) {
676 return _PyUnicode_COMPACT_DATA(unicode);
677}
678void *_PyUnicode_data(void *unicode){
679 printf("obj %p\n", unicode);
680 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
681 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
682 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
683 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
684 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
685 return PyUnicode_DATA(unicode);
686}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200687
688void
689_PyUnicode_Dump(PyObject *op)
690{
691 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200692 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
693 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
694 void *data;
695 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
696 if (ascii->state.compact)
697 data = (compact + 1);
698 else
699 data = unicode->data.any;
700 if (ascii->wstr == data)
701 printf("shared ");
702 printf("wstr=%p", ascii->wstr);
Victor Stinnera3b334d2011-10-03 13:53:37 +0200703 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200704 printf(" (%zu), ", compact->wstr_length);
705 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
706 printf("shared ");
707 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200708 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200709 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200710}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200711#endif
712
713PyObject *
714PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
715{
716 PyObject *obj;
717 PyCompactUnicodeObject *unicode;
718 void *data;
719 int kind_state;
720 int is_sharing = 0, is_ascii = 0;
721 Py_ssize_t char_size;
722 Py_ssize_t struct_size;
723
724 /* Optimization for empty strings */
725 if (size == 0 && unicode_empty != NULL) {
726 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200727 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200728 }
729
730#ifdef Py_DEBUG
731 ++unicode_new_new_calls;
732#endif
733
734 struct_size = sizeof(PyCompactUnicodeObject);
735 if (maxchar < 128) {
736 kind_state = PyUnicode_1BYTE_KIND;
737 char_size = 1;
738 is_ascii = 1;
739 struct_size = sizeof(PyASCIIObject);
740 }
741 else if (maxchar < 256) {
742 kind_state = PyUnicode_1BYTE_KIND;
743 char_size = 1;
744 }
745 else if (maxchar < 65536) {
746 kind_state = PyUnicode_2BYTE_KIND;
747 char_size = 2;
748 if (sizeof(wchar_t) == 2)
749 is_sharing = 1;
750 }
751 else {
752 kind_state = PyUnicode_4BYTE_KIND;
753 char_size = 4;
754 if (sizeof(wchar_t) == 4)
755 is_sharing = 1;
756 }
757
758 /* Ensure we won't overflow the size. */
759 if (size < 0) {
760 PyErr_SetString(PyExc_SystemError,
761 "Negative size passed to PyUnicode_New");
762 return NULL;
763 }
764 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
765 return PyErr_NoMemory();
766
767 /* Duplicated allocation code from _PyObject_New() instead of a call to
768 * PyObject_New() so we are able to allocate space for the object and
769 * it's data buffer.
770 */
771 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
772 if (obj == NULL)
773 return PyErr_NoMemory();
774 obj = PyObject_INIT(obj, &PyUnicode_Type);
775 if (obj == NULL)
776 return NULL;
777
778 unicode = (PyCompactUnicodeObject *)obj;
779 if (is_ascii)
780 data = ((PyASCIIObject*)obj) + 1;
781 else
782 data = unicode + 1;
783 _PyUnicode_LENGTH(unicode) = size;
784 _PyUnicode_HASH(unicode) = -1;
785 _PyUnicode_STATE(unicode).interned = 0;
786 _PyUnicode_STATE(unicode).kind = kind_state;
787 _PyUnicode_STATE(unicode).compact = 1;
788 _PyUnicode_STATE(unicode).ready = 1;
789 _PyUnicode_STATE(unicode).ascii = is_ascii;
790 if (is_ascii) {
791 ((char*)data)[size] = 0;
792 _PyUnicode_WSTR(unicode) = NULL;
793 }
794 else if (kind_state == PyUnicode_1BYTE_KIND) {
795 ((char*)data)[size] = 0;
796 _PyUnicode_WSTR(unicode) = NULL;
797 _PyUnicode_WSTR_LENGTH(unicode) = 0;
798 unicode->utf8_length = 0;
799 unicode->utf8 = NULL;
800 }
801 else {
802 unicode->utf8 = NULL;
803 if (kind_state == PyUnicode_2BYTE_KIND)
804 ((Py_UCS2*)data)[size] = 0;
805 else /* kind_state == PyUnicode_4BYTE_KIND */
806 ((Py_UCS4*)data)[size] = 0;
807 if (is_sharing) {
808 _PyUnicode_WSTR_LENGTH(unicode) = size;
809 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
810 }
811 else {
812 _PyUnicode_WSTR_LENGTH(unicode) = 0;
813 _PyUnicode_WSTR(unicode) = NULL;
814 }
815 }
816 return obj;
817}
818
819#if SIZEOF_WCHAR_T == 2
820/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
821 will decode surrogate pairs, the other conversions are implemented as macros
822 for efficency.
823
824 This function assumes that unicode can hold one more code point than wstr
825 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +0200826static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200827unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
828 PyUnicodeObject *unicode)
829{
830 const wchar_t *iter;
831 Py_UCS4 *ucs4_out;
832
Victor Stinner910337b2011-10-03 03:20:16 +0200833 assert(unicode != NULL);
834 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200835 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
836 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
837
838 for (iter = begin; iter < end; ) {
839 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
840 _PyUnicode_GET_LENGTH(unicode)));
841 if (*iter >= 0xD800 && *iter <= 0xDBFF
842 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
843 {
844 *ucs4_out++ = (((iter[0] & 0x3FF)<<10) | (iter[1] & 0x3FF)) + 0x10000;
845 iter += 2;
846 }
847 else {
848 *ucs4_out++ = *iter;
849 iter++;
850 }
851 }
852 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
853 _PyUnicode_GET_LENGTH(unicode)));
854
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200855}
856#endif
857
Victor Stinnercd9950f2011-10-02 00:34:53 +0200858static int
859_PyUnicode_Dirty(PyObject *unicode)
860{
Victor Stinner910337b2011-10-03 03:20:16 +0200861 assert(_PyUnicode_CHECK(unicode));
Victor Stinnercd9950f2011-10-02 00:34:53 +0200862 if (Py_REFCNT(unicode) != 1) {
863 PyErr_SetString(PyExc_ValueError,
864 "Cannot modify a string having more than 1 reference");
865 return -1;
866 }
867 _PyUnicode_DIRTY(unicode);
868 return 0;
869}
870
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200871Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200872PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
873 PyObject *from, Py_ssize_t from_start,
874 Py_ssize_t how_many)
875{
Victor Stinnera0702ab2011-09-29 14:14:38 +0200876 unsigned int from_kind, to_kind;
877 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200878
Victor Stinnerb1536152011-09-30 02:26:10 +0200879 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
880 PyErr_BadInternalCall();
881 return -1;
882 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200883
884 if (PyUnicode_READY(from))
885 return -1;
886 if (PyUnicode_READY(to))
887 return -1;
888
Victor Stinnerff9e50f2011-09-28 22:17:19 +0200889 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200890 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
891 PyErr_Format(PyExc_ValueError,
892 "Cannot write %zi characters at %zi "
893 "in a string of %zi characters",
894 how_many, to_start, PyUnicode_GET_LENGTH(to));
895 return -1;
896 }
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200897 if (how_many == 0)
898 return 0;
899
Victor Stinnercd9950f2011-10-02 00:34:53 +0200900 if (_PyUnicode_Dirty(to))
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200901 return -1;
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200902
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200903 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200904 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200905 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200906 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200907
Victor Stinnerf42dc442011-10-02 23:33:16 +0200908 if (from_kind == to_kind
909 /* deny latin1 => ascii */
910 && PyUnicode_MAX_CHAR_VALUE(to) >= PyUnicode_MAX_CHAR_VALUE(from))
911 {
Victor Stinnera0702ab2011-09-29 14:14:38 +0200912 Py_MEMCPY((char*)to_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200913 + PyUnicode_KIND_SIZE(to_kind, to_start),
Victor Stinnera0702ab2011-09-29 14:14:38 +0200914 (char*)from_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200915 + PyUnicode_KIND_SIZE(from_kind, from_start),
916 PyUnicode_KIND_SIZE(to_kind, how_many));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200917 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200918 else if (from_kind == PyUnicode_1BYTE_KIND
919 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200920 {
921 _PyUnicode_CONVERT_BYTES(
922 Py_UCS1, Py_UCS2,
923 PyUnicode_1BYTE_DATA(from) + from_start,
924 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
925 PyUnicode_2BYTE_DATA(to) + to_start
926 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200927 }
Victor Stinner157f83f2011-09-28 21:41:31 +0200928 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200929 && to_kind == PyUnicode_4BYTE_KIND)
930 {
931 _PyUnicode_CONVERT_BYTES(
932 Py_UCS1, Py_UCS4,
933 PyUnicode_1BYTE_DATA(from) + from_start,
934 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
935 PyUnicode_4BYTE_DATA(to) + to_start
936 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200937 }
938 else if (from_kind == PyUnicode_2BYTE_KIND
939 && to_kind == PyUnicode_4BYTE_KIND)
940 {
941 _PyUnicode_CONVERT_BYTES(
942 Py_UCS2, Py_UCS4,
943 PyUnicode_2BYTE_DATA(from) + from_start,
944 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
945 PyUnicode_4BYTE_DATA(to) + to_start
946 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200947 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200948 else {
949 int invalid_kinds;
Victor Stinnerf42dc442011-10-02 23:33:16 +0200950
951 /* check if max_char(from substring) <= max_char(to) */
952 if (from_kind > to_kind
953 /* latin1 => ascii */
Victor Stinnera3b334d2011-10-03 13:53:37 +0200954 || (PyUnicode_IS_ASCII(to)
Victor Stinnerf42dc442011-10-02 23:33:16 +0200955 && to_kind == PyUnicode_1BYTE_KIND
Victor Stinnera3b334d2011-10-03 13:53:37 +0200956 && !PyUnicode_IS_ASCII(from)))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200957 {
Victor Stinnera0702ab2011-09-29 14:14:38 +0200958 /* slow path to check for character overflow */
959 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
960 Py_UCS4 ch, maxchar;
961 Py_ssize_t i;
962
963 maxchar = 0;
964 invalid_kinds = 0;
965 for (i=0; i < how_many; i++) {
966 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
967 if (ch > maxchar) {
968 maxchar = ch;
969 if (maxchar > to_maxchar) {
970 invalid_kinds = 1;
971 break;
972 }
973 }
974 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
975 }
976 }
977 else
978 invalid_kinds = 1;
979 if (invalid_kinds) {
980 PyErr_Format(PyExc_ValueError,
Victor Stinnerf42dc442011-10-02 23:33:16 +0200981 "Cannot copy %s characters "
982 "into a string of %s characters",
983 unicode_kind_name(from),
984 unicode_kind_name(to));
Victor Stinnera0702ab2011-09-29 14:14:38 +0200985 return -1;
986 }
987 }
988 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200989}
990
Victor Stinner17222162011-09-28 22:15:37 +0200991/* Find the maximum code point and count the number of surrogate pairs so a
992 correct string length can be computed before converting a string to UCS4.
993 This function counts single surrogates as a character and not as a pair.
994
995 Return 0 on success, or -1 on error. */
996static int
997find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
998 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200999{
1000 const wchar_t *iter;
1001
Victor Stinnerc53be962011-10-02 21:33:54 +02001002 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001003 if (num_surrogates == NULL || maxchar == NULL) {
1004 PyErr_SetString(PyExc_SystemError,
1005 "unexpected NULL arguments to "
1006 "PyUnicode_FindMaxCharAndNumSurrogatePairs");
1007 return -1;
1008 }
1009
1010 *num_surrogates = 0;
1011 *maxchar = 0;
1012
1013 for (iter = begin; iter < end; ) {
1014 if (*iter > *maxchar)
1015 *maxchar = *iter;
1016#if SIZEOF_WCHAR_T == 2
1017 if (*iter >= 0xD800 && *iter <= 0xDBFF
1018 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
1019 {
1020 Py_UCS4 surrogate_val;
1021 surrogate_val = (((iter[0] & 0x3FF)<<10)
1022 | (iter[1] & 0x3FF)) + 0x10000;
1023 ++(*num_surrogates);
1024 if (surrogate_val > *maxchar)
1025 *maxchar = surrogate_val;
1026 iter += 2;
1027 }
1028 else
1029 iter++;
1030#else
1031 iter++;
1032#endif
1033 }
1034 return 0;
1035}
1036
1037#ifdef Py_DEBUG
1038int unicode_ready_calls = 0;
1039#endif
1040
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001041static int
1042unicode_ready(PyObject **p_obj, int replace)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001043{
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001044 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001045 wchar_t *end;
1046 Py_UCS4 maxchar = 0;
1047 Py_ssize_t num_surrogates;
1048#if SIZEOF_WCHAR_T == 2
1049 Py_ssize_t length_wo_surrogates;
1050#endif
1051
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001052 assert(p_obj != NULL);
1053 unicode = (PyUnicodeObject *)*p_obj;
1054
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001055 /* _PyUnicode_Ready() is only intented for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001056 strings were created using _PyObject_New() and where no canonical
1057 representation (the str field) has been set yet aka strings
1058 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001059 assert(_PyUnicode_CHECK(unicode));
1060 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001061 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001062 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001063 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001064 /* Actually, it should neither be interned nor be anything else: */
1065 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001066
1067#ifdef Py_DEBUG
1068 ++unicode_ready_calls;
1069#endif
1070
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001071#ifdef Py_DEBUG
1072 assert(!replace || Py_REFCNT(unicode) == 1);
1073#else
1074 if (replace && Py_REFCNT(unicode) != 1)
1075 replace = 0;
1076#endif
1077 if (replace) {
1078 Py_ssize_t len = _PyUnicode_WSTR_LENGTH(unicode);
1079 wchar_t *wstr = _PyUnicode_WSTR(unicode);
1080 /* Optimization for empty strings */
1081 if (len == 0) {
1082 Py_INCREF(unicode_empty);
1083 Py_DECREF(*p_obj);
1084 *p_obj = unicode_empty;
1085 return 0;
1086 }
1087 if (len == 1 && wstr[0] < 256) {
1088 PyObject *latin1_char = get_latin1_char((unsigned char)wstr[0]);
1089 if (latin1_char == NULL)
1090 return -1;
1091 Py_DECREF(*p_obj);
1092 *p_obj = latin1_char;
1093 return 0;
1094 }
1095 }
1096
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001097 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001098 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001099 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001100 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001101
1102 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001103 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1104 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001105 PyErr_NoMemory();
1106 return -1;
1107 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001108 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001109 _PyUnicode_WSTR(unicode), end,
1110 PyUnicode_1BYTE_DATA(unicode));
1111 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1112 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1113 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1114 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001115 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001116 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001117 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001118 }
1119 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001120 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001121 _PyUnicode_UTF8(unicode) = NULL;
1122 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001123 }
1124 PyObject_FREE(_PyUnicode_WSTR(unicode));
1125 _PyUnicode_WSTR(unicode) = NULL;
1126 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1127 }
1128 /* In this case we might have to convert down from 4-byte native
1129 wchar_t to 2-byte unicode. */
1130 else if (maxchar < 65536) {
1131 assert(num_surrogates == 0 &&
1132 "FindMaxCharAndNumSurrogatePairs() messed up");
1133
Victor Stinner506f5922011-09-28 22:34:18 +02001134#if SIZEOF_WCHAR_T == 2
1135 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001136 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001137 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1138 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1139 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001140 _PyUnicode_UTF8(unicode) = NULL;
1141 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001142#else
1143 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001144 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001145 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001146 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001147 PyErr_NoMemory();
1148 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001149 }
Victor Stinner506f5922011-09-28 22:34:18 +02001150 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1151 _PyUnicode_WSTR(unicode), end,
1152 PyUnicode_2BYTE_DATA(unicode));
1153 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1154 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1155 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001156 _PyUnicode_UTF8(unicode) = NULL;
1157 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001158 PyObject_FREE(_PyUnicode_WSTR(unicode));
1159 _PyUnicode_WSTR(unicode) = NULL;
1160 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1161#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001162 }
1163 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1164 else {
1165#if SIZEOF_WCHAR_T == 2
1166 /* in case the native representation is 2-bytes, we need to allocate a
1167 new normalized 4-byte version. */
1168 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001169 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1170 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001171 PyErr_NoMemory();
1172 return -1;
1173 }
1174 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1175 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001176 _PyUnicode_UTF8(unicode) = NULL;
1177 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001178 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1179 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001180 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001181 PyObject_FREE(_PyUnicode_WSTR(unicode));
1182 _PyUnicode_WSTR(unicode) = NULL;
1183 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1184#else
1185 assert(num_surrogates == 0);
1186
Victor Stinnerc3c74152011-10-02 20:39:55 +02001187 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001188 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001189 _PyUnicode_UTF8(unicode) = NULL;
1190 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001191 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1192#endif
1193 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1194 }
1195 _PyUnicode_STATE(unicode).ready = 1;
1196 return 0;
1197}
1198
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001199int
1200_PyUnicode_ReadyReplace(PyObject **op)
1201{
1202 return unicode_ready(op, 1);
1203}
1204
1205int
1206_PyUnicode_Ready(PyObject *op)
1207{
1208 return unicode_ready(&op, 0);
1209}
1210
Alexander Belopolsky40018472011-02-26 01:02:56 +00001211static void
1212unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001213{
Walter Dörwald16807132007-05-25 13:52:07 +00001214 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001215 case SSTATE_NOT_INTERNED:
1216 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001217
Benjamin Peterson29060642009-01-31 22:14:21 +00001218 case SSTATE_INTERNED_MORTAL:
1219 /* revive dead object temporarily for DelItem */
1220 Py_REFCNT(unicode) = 3;
1221 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
1222 Py_FatalError(
1223 "deletion of interned string failed");
1224 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001225
Benjamin Peterson29060642009-01-31 22:14:21 +00001226 case SSTATE_INTERNED_IMMORTAL:
1227 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001228
Benjamin Peterson29060642009-01-31 22:14:21 +00001229 default:
1230 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001231 }
1232
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001233 if (_PyUnicode_WSTR(unicode) &&
1234 (!PyUnicode_IS_READY(unicode) ||
1235 _PyUnicode_WSTR(unicode) != PyUnicode_DATA(unicode)))
1236 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001237 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001238 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001239
1240 if (PyUnicode_IS_COMPACT(unicode)) {
1241 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001242 }
1243 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001244 if (_PyUnicode_DATA_ANY(unicode))
1245 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Benjamin Peterson29060642009-01-31 22:14:21 +00001246 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001247 }
1248}
1249
Alexander Belopolsky40018472011-02-26 01:02:56 +00001250static int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001251unicode_resizable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001252{
Victor Stinnera3be6132011-10-03 02:16:37 +02001253 Py_ssize_t len;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001254 if (Py_REFCNT(unicode) != 1)
1255 return 0;
1256 if (PyUnicode_CHECK_INTERNED(unicode))
1257 return 0;
1258 if (unicode == unicode_empty)
1259 return 0;
Victor Stinnera3be6132011-10-03 02:16:37 +02001260 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
1261 len = PyUnicode_WSTR_LENGTH(unicode);
1262 else
1263 len = PyUnicode_GET_LENGTH(unicode);
1264 if (len == 1) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001265 Py_UCS4 ch;
Victor Stinnera3be6132011-10-03 02:16:37 +02001266 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001267 ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnera3be6132011-10-03 02:16:37 +02001268 else
1269 ch = PyUnicode_READ_CHAR(unicode, 0);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001270 if (ch < 256 && unicode_latin1[ch] == unicode)
1271 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001272 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001273 return 1;
1274}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001275
Victor Stinnerfe226c02011-10-03 03:52:20 +02001276static int
1277unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1278{
1279 PyObject *unicode;
1280 Py_ssize_t old_length;
1281
1282 assert(p_unicode != NULL);
1283 unicode = *p_unicode;
1284
1285 assert(unicode != NULL);
1286 assert(PyUnicode_Check(unicode));
1287 assert(0 <= length);
1288
Victor Stinner910337b2011-10-03 03:20:16 +02001289 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001290 old_length = PyUnicode_WSTR_LENGTH(unicode);
1291 else
1292 old_length = PyUnicode_GET_LENGTH(unicode);
1293 if (old_length == length)
1294 return 0;
1295
1296 /* FIXME: really create a new object? */
1297 if (!unicode_resizable(unicode)) {
1298 PyObject *copy = resize_copy(unicode, length);
1299 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001300 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001301 Py_DECREF(*p_unicode);
1302 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001303 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001304 }
1305
Victor Stinnerfe226c02011-10-03 03:52:20 +02001306 if (PyUnicode_IS_COMPACT(unicode)) {
1307 *p_unicode = resize_compact(unicode, length);
1308 if (*p_unicode == NULL)
1309 return -1;
1310 return 0;
1311 } else
1312 return resize_inplace((PyUnicodeObject*)unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001313}
1314
Alexander Belopolsky40018472011-02-26 01:02:56 +00001315int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001316PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001317{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001318 PyObject *unicode;
1319 if (p_unicode == NULL) {
1320 PyErr_BadInternalCall();
1321 return -1;
1322 }
1323 unicode = *p_unicode;
1324 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0
1325 || _PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND)
1326 {
1327 PyErr_BadInternalCall();
1328 return -1;
1329 }
1330 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001331}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001332
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001333static PyObject*
1334get_latin1_char(unsigned char ch)
1335{
Victor Stinnera464fc12011-10-02 20:39:30 +02001336 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001337 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001338 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001339 if (!unicode)
1340 return NULL;
1341 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1342 unicode_latin1[ch] = unicode;
1343 }
1344 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001345 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001346}
1347
Alexander Belopolsky40018472011-02-26 01:02:56 +00001348PyObject *
1349PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001350{
1351 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001352 Py_UCS4 maxchar = 0;
1353 Py_ssize_t num_surrogates;
1354
1355 if (u == NULL)
1356 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001357
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001358 /* If the Unicode data is known at construction time, we can apply
1359 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001360
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001361 /* Optimization for empty strings */
1362 if (size == 0 && unicode_empty != NULL) {
1363 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001364 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001365 }
Tim Petersced69f82003-09-16 20:30:58 +00001366
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001367 /* Single character Unicode objects in the Latin-1 range are
1368 shared when using this constructor */
1369 if (size == 1 && *u < 256)
1370 return get_latin1_char((unsigned char)*u);
1371
1372 /* If not empty and not single character, copy the Unicode data
1373 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001374 if (find_maxchar_surrogates(u, u + size,
1375 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001376 return NULL;
1377
1378 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1379 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001380 if (!unicode)
1381 return NULL;
1382
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001383 switch (PyUnicode_KIND(unicode)) {
1384 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001385 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001386 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1387 break;
1388 case PyUnicode_2BYTE_KIND:
1389#if Py_UNICODE_SIZE == 2
1390 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1391#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001392 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001393 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1394#endif
1395 break;
1396 case PyUnicode_4BYTE_KIND:
1397#if SIZEOF_WCHAR_T == 2
1398 /* This is the only case which has to process surrogates, thus
1399 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001400 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001401#else
1402 assert(num_surrogates == 0);
1403 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1404#endif
1405 break;
1406 default:
1407 assert(0 && "Impossible state");
1408 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001409
1410 return (PyObject *)unicode;
1411}
1412
Alexander Belopolsky40018472011-02-26 01:02:56 +00001413PyObject *
1414PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001415{
1416 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001417
Benjamin Peterson14339b62009-01-31 16:36:08 +00001418 if (size < 0) {
1419 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001420 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001421 return NULL;
1422 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001423
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001424 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001425 some optimizations which share commonly used objects.
1426 Also, this means the input must be UTF-8, so fall back to the
1427 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001428 if (u != NULL) {
1429
Benjamin Peterson29060642009-01-31 22:14:21 +00001430 /* Optimization for empty strings */
1431 if (size == 0 && unicode_empty != NULL) {
1432 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001433 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001434 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001435
1436 /* Single characters are shared when using this constructor.
1437 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001438 if (size == 1 && Py_CHARMASK(*u) < 128)
1439 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001440
1441 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001442 }
1443
Walter Dörwald55507312007-05-18 13:12:10 +00001444 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001445 if (!unicode)
1446 return NULL;
1447
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001448 return (PyObject *)unicode;
1449}
1450
Alexander Belopolsky40018472011-02-26 01:02:56 +00001451PyObject *
1452PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001453{
1454 size_t size = strlen(u);
1455 if (size > PY_SSIZE_T_MAX) {
1456 PyErr_SetString(PyExc_OverflowError, "input too long");
1457 return NULL;
1458 }
1459
1460 return PyUnicode_FromStringAndSize(u, size);
1461}
1462
Victor Stinnere57b1c02011-09-28 22:20:48 +02001463static PyObject*
1464_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001465{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001466 PyObject *res;
1467 unsigned char max = 127;
1468 Py_ssize_t i;
1469 for (i = 0; i < size; i++) {
1470 if (u[i] & 0x80) {
1471 max = 255;
1472 break;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001473 }
1474 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001475 res = PyUnicode_New(size, max);
1476 if (!res)
1477 return NULL;
1478 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1479 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001480}
1481
Victor Stinnere57b1c02011-09-28 22:20:48 +02001482static PyObject*
1483_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001484{
1485 PyObject *res;
1486 Py_UCS2 max = 0;
1487 Py_ssize_t i;
1488 for (i = 0; i < size; i++)
1489 if (u[i] > max)
1490 max = u[i];
1491 res = PyUnicode_New(size, max);
1492 if (!res)
1493 return NULL;
1494 if (max >= 256)
1495 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1496 else
1497 for (i = 0; i < size; i++)
1498 PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i];
1499 return res;
1500}
1501
Victor Stinnere57b1c02011-09-28 22:20:48 +02001502static PyObject*
1503_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001504{
1505 PyObject *res;
1506 Py_UCS4 max = 0;
1507 Py_ssize_t i;
1508 for (i = 0; i < size; i++)
1509 if (u[i] > max)
1510 max = u[i];
1511 res = PyUnicode_New(size, max);
1512 if (!res)
1513 return NULL;
1514 if (max >= 0x10000)
1515 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1516 else {
1517 int kind = PyUnicode_KIND(res);
1518 void *data = PyUnicode_DATA(res);
1519 for (i = 0; i < size; i++)
1520 PyUnicode_WRITE(kind, data, i, u[i]);
1521 }
1522 return res;
1523}
1524
1525PyObject*
1526PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1527{
1528 switch(kind) {
1529 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001530 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001531 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001532 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001533 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001534 return _PyUnicode_FromUCS4(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001535 }
Victor Stinner202b62b2011-10-01 23:48:37 +02001536 PyErr_SetString(PyExc_ValueError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001537 return NULL;
1538}
1539
Victor Stinner034f6cf2011-09-30 02:26:44 +02001540PyObject*
1541PyUnicode_Copy(PyObject *unicode)
1542{
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001543 Py_ssize_t size;
1544 PyObject *copy;
1545 void *data;
1546
Victor Stinner034f6cf2011-09-30 02:26:44 +02001547 if (!PyUnicode_Check(unicode)) {
1548 PyErr_BadInternalCall();
1549 return NULL;
1550 }
1551 if (PyUnicode_READY(unicode))
1552 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001553
1554 size = PyUnicode_GET_LENGTH(unicode);
1555 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1556 if (!copy)
1557 return NULL;
1558 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1559
1560 data = PyUnicode_DATA(unicode);
1561 switch (PyUnicode_KIND(unicode))
1562 {
1563 case PyUnicode_1BYTE_KIND:
1564 memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1565 break;
1566 case PyUnicode_2BYTE_KIND:
1567 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1568 break;
1569 case PyUnicode_4BYTE_KIND:
1570 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1571 break;
1572 default:
1573 assert(0);
1574 break;
1575 }
1576 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001577}
1578
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001579
Victor Stinnerbc603d12011-10-02 01:00:40 +02001580/* Widen Unicode objects to larger buffers. Don't write terminating null
1581 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001582
1583void*
1584_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1585{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001586 Py_ssize_t len;
1587 void *result;
1588 unsigned int skind;
1589
1590 if (PyUnicode_READY(s))
1591 return NULL;
1592
1593 len = PyUnicode_GET_LENGTH(s);
1594 skind = PyUnicode_KIND(s);
1595 if (skind >= kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001596 PyErr_SetString(PyExc_RuntimeError, "invalid widening attempt");
1597 return NULL;
1598 }
1599 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001600 case PyUnicode_2BYTE_KIND:
1601 result = PyMem_Malloc(len * sizeof(Py_UCS2));
1602 if (!result)
1603 return PyErr_NoMemory();
1604 assert(skind == PyUnicode_1BYTE_KIND);
1605 _PyUnicode_CONVERT_BYTES(
1606 Py_UCS1, Py_UCS2,
1607 PyUnicode_1BYTE_DATA(s),
1608 PyUnicode_1BYTE_DATA(s) + len,
1609 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001610 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001611 case PyUnicode_4BYTE_KIND:
1612 result = PyMem_Malloc(len * sizeof(Py_UCS4));
1613 if (!result)
1614 return PyErr_NoMemory();
1615 if (skind == PyUnicode_2BYTE_KIND) {
1616 _PyUnicode_CONVERT_BYTES(
1617 Py_UCS2, Py_UCS4,
1618 PyUnicode_2BYTE_DATA(s),
1619 PyUnicode_2BYTE_DATA(s) + len,
1620 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001621 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001622 else {
1623 assert(skind == PyUnicode_1BYTE_KIND);
1624 _PyUnicode_CONVERT_BYTES(
1625 Py_UCS1, Py_UCS4,
1626 PyUnicode_1BYTE_DATA(s),
1627 PyUnicode_1BYTE_DATA(s) + len,
1628 result);
1629 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001630 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001631 default:
1632 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001633 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001634 PyErr_SetString(PyExc_ValueError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001635 return NULL;
1636}
1637
1638static Py_UCS4*
1639as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1640 int copy_null)
1641{
1642 int kind;
1643 void *data;
1644 Py_ssize_t len, targetlen;
1645 if (PyUnicode_READY(string) == -1)
1646 return NULL;
1647 kind = PyUnicode_KIND(string);
1648 data = PyUnicode_DATA(string);
1649 len = PyUnicode_GET_LENGTH(string);
1650 targetlen = len;
1651 if (copy_null)
1652 targetlen++;
1653 if (!target) {
1654 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1655 PyErr_NoMemory();
1656 return NULL;
1657 }
1658 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1659 if (!target) {
1660 PyErr_NoMemory();
1661 return NULL;
1662 }
1663 }
1664 else {
1665 if (targetsize < targetlen) {
1666 PyErr_Format(PyExc_SystemError,
1667 "string is longer than the buffer");
1668 if (copy_null && 0 < targetsize)
1669 target[0] = 0;
1670 return NULL;
1671 }
1672 }
1673 if (kind != PyUnicode_4BYTE_KIND) {
1674 Py_ssize_t i;
1675 for (i = 0; i < len; i++)
1676 target[i] = PyUnicode_READ(kind, data, i);
1677 }
1678 else
1679 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1680 if (copy_null)
1681 target[len] = 0;
1682 return target;
1683}
1684
1685Py_UCS4*
1686PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1687 int copy_null)
1688{
1689 if (target == NULL || targetsize < 1) {
1690 PyErr_BadInternalCall();
1691 return NULL;
1692 }
1693 return as_ucs4(string, target, targetsize, copy_null);
1694}
1695
1696Py_UCS4*
1697PyUnicode_AsUCS4Copy(PyObject *string)
1698{
1699 return as_ucs4(string, NULL, 0, 1);
1700}
1701
#ifdef HAVE_WCHAR_H

/* Create a Unicode object from `size` wide characters at `w`.

   A `size` of -1 means the length is taken from wcslen(w).  A NULL `w` is
   accepted only together with a size of 0, which gives an empty string; any
   other size with a NULL `w` is reported with PyErr_BadInternalCall(). */
PyObject *
PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
{
    if (w != NULL) {
        if (size == -1)
            size = wcslen(w);
        return PyUnicode_FromUnicode(w, size);
    }

    if (size == 0)
        return PyUnicode_New(0, 0);

    PyErr_BadInternalCall();
    return NULL;
}

#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00001722
Walter Dörwald346737f2007-05-31 10:44:43 +00001723static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001724makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
1725 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00001726{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001727 *fmt++ = '%';
1728 if (width) {
1729 if (zeropad)
1730 *fmt++ = '0';
1731 fmt += sprintf(fmt, "%d", width);
1732 }
1733 if (precision)
1734 fmt += sprintf(fmt, ".%d", precision);
1735 if (longflag)
1736 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001737 else if (longlongflag) {
1738 /* longlongflag should only ever be nonzero on machines with
1739 HAVE_LONG_LONG defined */
1740#ifdef HAVE_LONG_LONG
1741 char *f = PY_FORMAT_LONG_LONG;
1742 while (*f)
1743 *fmt++ = *f++;
1744#else
1745 /* we shouldn't ever get here */
1746 assert(0);
1747 *fmt++ = 'l';
1748#endif
1749 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001750 else if (size_tflag) {
1751 char *f = PY_FORMAT_SIZE_T;
1752 while (*f)
1753 *fmt++ = *f++;
1754 }
1755 *fmt++ = c;
1756 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00001757}
1758
Victor Stinner96865452011-03-01 23:44:09 +00001759/* helper for PyUnicode_FromFormatV() */
1760
1761static const char*
1762parse_format_flags(const char *f,
1763 int *p_width, int *p_precision,
1764 int *p_longflag, int *p_longlongflag, int *p_size_tflag)
1765{
1766 int width, precision, longflag, longlongflag, size_tflag;
1767
1768 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
1769 f++;
1770 width = 0;
1771 while (Py_ISDIGIT((unsigned)*f))
1772 width = (width*10) + *f++ - '0';
1773 precision = 0;
1774 if (*f == '.') {
1775 f++;
1776 while (Py_ISDIGIT((unsigned)*f))
1777 precision = (precision*10) + *f++ - '0';
1778 if (*f == '%') {
1779 /* "%.3%s" => f points to "3" */
1780 f--;
1781 }
1782 }
1783 if (*f == '\0') {
1784 /* bogus format "%.1" => go backward, f points to "1" */
1785 f--;
1786 }
1787 if (p_width != NULL)
1788 *p_width = width;
1789 if (p_precision != NULL)
1790 *p_precision = precision;
1791
1792 /* Handle %ld, %lu, %lld and %llu. */
1793 longflag = 0;
1794 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00001795 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00001796
1797 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00001798 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00001799 longflag = 1;
1800 ++f;
1801 }
1802#ifdef HAVE_LONG_LONG
1803 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00001804 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00001805 longlongflag = 1;
1806 f += 2;
1807 }
1808#endif
1809 }
1810 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00001811 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00001812 size_tflag = 1;
1813 ++f;
1814 }
1815 if (p_longflag != NULL)
1816 *p_longflag = longflag;
1817 if (p_longlongflag != NULL)
1818 *p_longlongflag = longlongflag;
1819 if (p_size_tflag != NULL)
1820 *p_size_tflag = size_tflag;
1821 return f;
1822}
1823
/* Upper bound on the characters produced by %ld: 21 characters cover a
   64-bit integer written in decimal plus an optional sign. */
#define MAX_LONG_CHARS 21
/* Upper bound on the characters produced by %lld.
   At most ceil(log10(256) * SIZEOF_LONG_LONG) digits are needed, plus one
   character for the sign; 53/22 is an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG * 53 - 1) / 22)
1831
Walter Dörwaldd2034312007-05-18 16:29:38 +00001832PyObject *
1833PyUnicode_FromFormatV(const char *format, va_list vargs)
1834{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001835 va_list count;
1836 Py_ssize_t callcount = 0;
1837 PyObject **callresults = NULL;
1838 PyObject **callresult = NULL;
1839 Py_ssize_t n = 0;
1840 int width = 0;
1841 int precision = 0;
1842 int zeropad;
1843 const char* f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001844 PyUnicodeObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001845 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001846 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001847 Py_UCS4 maxchar = 127; /* result is ASCII by default */
1848 Py_UCS4 argmaxchar;
1849 Py_ssize_t numbersize = 0;
1850 char *numberresults = NULL;
1851 char *numberresult = NULL;
1852 Py_ssize_t i;
1853 int kind;
1854 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001855
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001856 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001857 /* step 1: count the number of %S/%R/%A/%s format specifications
1858 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
1859 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001860 * result in an array)
1861 * also estimate an upper bound for all the number formats in the string,
1862 * numbers will be formatted in step 3 and be kept in a '\0'-separated
1863 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00001864 for (f = format; *f; f++) {
1865 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001866 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001867 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
1868 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
1869 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
1870 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001871
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001872 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001873#ifdef HAVE_LONG_LONG
1874 if (longlongflag) {
1875 if (width < MAX_LONG_LONG_CHARS)
1876 width = MAX_LONG_LONG_CHARS;
1877 }
1878 else
1879#endif
1880 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
1881 including sign. Decimal takes the most space. This
1882 isn't enough for octal. If a width is specified we
1883 need more (which we allocate later). */
1884 if (width < MAX_LONG_CHARS)
1885 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001886
1887 /* account for the size + '\0' to separate numbers
1888 inside of the numberresults buffer */
1889 numbersize += (width + 1);
1890 }
1891 }
1892 else if ((unsigned char)*f > 127) {
1893 PyErr_Format(PyExc_ValueError,
1894 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
1895 "string, got a non-ASCII byte: 0x%02x",
1896 (unsigned char)*f);
1897 return NULL;
1898 }
1899 }
1900 /* step 2: allocate memory for the results of
1901 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
1902 if (callcount) {
1903 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
1904 if (!callresults) {
1905 PyErr_NoMemory();
1906 return NULL;
1907 }
1908 callresult = callresults;
1909 }
1910 /* step 2.5: allocate memory for the results of formatting numbers */
1911 if (numbersize) {
1912 numberresults = PyObject_Malloc(numbersize);
1913 if (!numberresults) {
1914 PyErr_NoMemory();
1915 goto fail;
1916 }
1917 numberresult = numberresults;
1918 }
1919
1920 /* step 3: format numbers and figure out how large a buffer we need */
1921 for (f = format; *f; f++) {
1922 if (*f == '%') {
1923 const char* p;
1924 int longflag;
1925 int longlongflag;
1926 int size_tflag;
1927 int numprinted;
1928
1929 p = f;
1930 zeropad = (f[1] == '0');
1931 f = parse_format_flags(f, &width, &precision,
1932 &longflag, &longlongflag, &size_tflag);
1933 switch (*f) {
1934 case 'c':
1935 {
1936 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001937 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001938 n++;
1939 break;
1940 }
1941 case '%':
1942 n++;
1943 break;
1944 case 'i':
1945 case 'd':
1946 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1947 width, precision, *f);
1948 if (longflag)
1949 numprinted = sprintf(numberresult, fmt,
1950 va_arg(count, long));
1951#ifdef HAVE_LONG_LONG
1952 else if (longlongflag)
1953 numprinted = sprintf(numberresult, fmt,
1954 va_arg(count, PY_LONG_LONG));
1955#endif
1956 else if (size_tflag)
1957 numprinted = sprintf(numberresult, fmt,
1958 va_arg(count, Py_ssize_t));
1959 else
1960 numprinted = sprintf(numberresult, fmt,
1961 va_arg(count, int));
1962 n += numprinted;
1963 /* advance by +1 to skip over the '\0' */
1964 numberresult += (numprinted + 1);
1965 assert(*(numberresult - 1) == '\0');
1966 assert(*(numberresult - 2) != '\0');
1967 assert(numprinted >= 0);
1968 assert(numberresult <= numberresults + numbersize);
1969 break;
1970 case 'u':
1971 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1972 width, precision, 'u');
1973 if (longflag)
1974 numprinted = sprintf(numberresult, fmt,
1975 va_arg(count, unsigned long));
1976#ifdef HAVE_LONG_LONG
1977 else if (longlongflag)
1978 numprinted = sprintf(numberresult, fmt,
1979 va_arg(count, unsigned PY_LONG_LONG));
1980#endif
1981 else if (size_tflag)
1982 numprinted = sprintf(numberresult, fmt,
1983 va_arg(count, size_t));
1984 else
1985 numprinted = sprintf(numberresult, fmt,
1986 va_arg(count, unsigned int));
1987 n += numprinted;
1988 numberresult += (numprinted + 1);
1989 assert(*(numberresult - 1) == '\0');
1990 assert(*(numberresult - 2) != '\0');
1991 assert(numprinted >= 0);
1992 assert(numberresult <= numberresults + numbersize);
1993 break;
1994 case 'x':
1995 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
1996 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
1997 n += numprinted;
1998 numberresult += (numprinted + 1);
1999 assert(*(numberresult - 1) == '\0');
2000 assert(*(numberresult - 2) != '\0');
2001 assert(numprinted >= 0);
2002 assert(numberresult <= numberresults + numbersize);
2003 break;
2004 case 'p':
2005 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2006 /* %p is ill-defined: ensure leading 0x. */
2007 if (numberresult[1] == 'X')
2008 numberresult[1] = 'x';
2009 else if (numberresult[1] != 'x') {
2010 memmove(numberresult + 2, numberresult,
2011 strlen(numberresult) + 1);
2012 numberresult[0] = '0';
2013 numberresult[1] = 'x';
2014 numprinted += 2;
2015 }
2016 n += numprinted;
2017 numberresult += (numprinted + 1);
2018 assert(*(numberresult - 1) == '\0');
2019 assert(*(numberresult - 2) != '\0');
2020 assert(numprinted >= 0);
2021 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002022 break;
2023 case 's':
2024 {
2025 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002026 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002027 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
2028 if (!str)
2029 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002030 /* since PyUnicode_DecodeUTF8 returns already flexible
2031 unicode objects, there is no need to call ready on them */
2032 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002033 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002034 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002035 /* Remember the str and switch to the next slot */
2036 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002037 break;
2038 }
2039 case 'U':
2040 {
2041 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002042 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002043 if (PyUnicode_READY(obj) == -1)
2044 goto fail;
2045 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002046 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002047 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002048 break;
2049 }
2050 case 'V':
2051 {
2052 PyObject *obj = va_arg(count, PyObject *);
2053 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002054 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002055 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002056 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002057 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002058 if (PyUnicode_READY(obj) == -1)
2059 goto fail;
2060 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002061 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002062 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002063 *callresult++ = NULL;
2064 }
2065 else {
2066 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2067 if (!str_obj)
2068 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002069 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002070 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002071 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002072 *callresult++ = str_obj;
2073 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002074 break;
2075 }
2076 case 'S':
2077 {
2078 PyObject *obj = va_arg(count, PyObject *);
2079 PyObject *str;
2080 assert(obj);
2081 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002082 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002083 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002084 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002085 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002086 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002087 /* Remember the str and switch to the next slot */
2088 *callresult++ = str;
2089 break;
2090 }
2091 case 'R':
2092 {
2093 PyObject *obj = va_arg(count, PyObject *);
2094 PyObject *repr;
2095 assert(obj);
2096 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002097 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002098 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002099 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002100 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002101 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002102 /* Remember the repr and switch to the next slot */
2103 *callresult++ = repr;
2104 break;
2105 }
2106 case 'A':
2107 {
2108 PyObject *obj = va_arg(count, PyObject *);
2109 PyObject *ascii;
2110 assert(obj);
2111 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002112 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002113 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002114 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002115 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002116 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002117 /* Remember the repr and switch to the next slot */
2118 *callresult++ = ascii;
2119 break;
2120 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002121 default:
2122 /* if we stumble upon an unknown
2123 formatting code, copy the rest of
2124 the format string to the output
2125 string. (we cannot just skip the
2126 code, since there's no way to know
2127 what's in the argument list) */
2128 n += strlen(p);
2129 goto expand;
2130 }
2131 } else
2132 n++;
2133 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002134 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002135 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002136 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002137 we don't have to resize the string.
2138 There can be no errors beyond this point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002139 string = (PyUnicodeObject *)PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002140 if (!string)
2141 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002142 kind = PyUnicode_KIND(string);
2143 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002144 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002145 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002146
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002147 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002148 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002149 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002150
2151 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002152 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2153 /* checking for == because the last argument could be a empty
2154 string, which causes i to point to end, the assert at the end of
2155 the loop */
2156 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002157
Benjamin Peterson14339b62009-01-31 16:36:08 +00002158 switch (*f) {
2159 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002160 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002161 const int ordinal = va_arg(vargs, int);
2162 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002163 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002164 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002165 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002166 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002167 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002168 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002169 case 'p':
2170 /* unused, since we already have the result */
2171 if (*f == 'p')
2172 (void) va_arg(vargs, void *);
2173 else
2174 (void) va_arg(vargs, int);
2175 /* extract the result from numberresults and append. */
2176 for (; *numberresult; ++i, ++numberresult)
2177 PyUnicode_WRITE(kind, data, i, *numberresult);
2178 /* skip over the separating '\0' */
2179 assert(*numberresult == '\0');
2180 numberresult++;
2181 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002182 break;
2183 case 's':
2184 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002185 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002186 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002187 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002188 size = PyUnicode_GET_LENGTH(*callresult);
2189 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002190 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2191 *callresult, 0,
2192 size) < 0)
2193 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002194 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002195 /* We're done with the unicode()/repr() => forget it */
2196 Py_DECREF(*callresult);
2197 /* switch to next unicode()/repr() result */
2198 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002199 break;
2200 }
2201 case 'U':
2202 {
2203 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002204 Py_ssize_t size;
2205 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2206 size = PyUnicode_GET_LENGTH(obj);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002207 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2208 obj, 0,
2209 size) < 0)
2210 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002211 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002212 break;
2213 }
2214 case 'V':
2215 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002216 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002217 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002218 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002219 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002220 size = PyUnicode_GET_LENGTH(obj);
2221 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002222 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2223 obj, 0,
2224 size) < 0)
2225 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002226 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002227 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002228 size = PyUnicode_GET_LENGTH(*callresult);
2229 assert(PyUnicode_KIND(*callresult) <=
2230 PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002231 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2232 *callresult,
2233 0, size) < 0)
2234 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002235 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002236 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002237 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002238 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002239 break;
2240 }
2241 case 'S':
2242 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002243 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002244 {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002245 /* unused, since we already have the result */
2246 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002247 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002248 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2249 *callresult, 0,
2250 PyUnicode_GET_LENGTH(*callresult)) < 0)
2251 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002252 i += PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002253 /* We're done with the unicode()/repr() => forget it */
2254 Py_DECREF(*callresult);
2255 /* switch to next unicode()/repr() result */
2256 ++callresult;
2257 break;
2258 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002259 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002260 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002261 break;
2262 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002263 for (; *p; ++p, ++i)
2264 PyUnicode_WRITE(kind, data, i, *p);
2265 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002266 goto end;
2267 }
Victor Stinner1205f272010-09-11 00:54:47 +00002268 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002269 else {
2270 assert(i < PyUnicode_GET_LENGTH(string));
2271 PyUnicode_WRITE(kind, data, i++, *f);
2272 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002273 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002274 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002275
Benjamin Peterson29060642009-01-31 22:14:21 +00002276 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002277 if (callresults)
2278 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002279 if (numberresults)
2280 PyObject_Free(numberresults);
2281 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00002282 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002283 if (callresults) {
2284 PyObject **callresult2 = callresults;
2285 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002286 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002287 ++callresult2;
2288 }
2289 PyObject_Free(callresults);
2290 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002291 if (numberresults)
2292 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002293 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002294}
2295
Walter Dörwaldd2034312007-05-18 16:29:38 +00002296PyObject *
2297PyUnicode_FromFormat(const char *format, ...)
2298{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002299 PyObject* ret;
2300 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002301
2302#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002303 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002304#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002305 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002306#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002307 ret = PyUnicode_FromFormatV(format, vargs);
2308 va_end(vargs);
2309 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002310}
2311
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002312#ifdef HAVE_WCHAR_H
2313
Victor Stinner5593d8a2010-10-02 11:11:27 +00002314/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2315 convert a Unicode object to a wide character string.
2316
Victor Stinnerd88d9832011-09-06 02:00:05 +02002317 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002318 character) required to convert the unicode object. Ignore size argument.
2319
Victor Stinnerd88d9832011-09-06 02:00:05 +02002320 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002321 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002322 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002323static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00002324unicode_aswidechar(PyUnicodeObject *unicode,
2325 wchar_t *w,
2326 Py_ssize_t size)
2327{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002328 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002329 const wchar_t *wstr;
2330
2331 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
2332 if (wstr == NULL)
2333 return -1;
2334
Victor Stinner5593d8a2010-10-02 11:11:27 +00002335 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002336 if (size > res)
2337 size = res + 1;
2338 else
2339 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002340 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002341 return res;
2342 }
2343 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002344 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002345}
2346
2347Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002348PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002349 wchar_t *w,
2350 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002351{
2352 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002353 PyErr_BadInternalCall();
2354 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002355 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002356 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002357}
2358
Victor Stinner137c34c2010-09-29 10:25:54 +00002359wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002360PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002361 Py_ssize_t *size)
2362{
2363 wchar_t* buffer;
2364 Py_ssize_t buflen;
2365
2366 if (unicode == NULL) {
2367 PyErr_BadInternalCall();
2368 return NULL;
2369 }
2370
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002371 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002372 if (buflen == -1)
2373 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002374 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002375 PyErr_NoMemory();
2376 return NULL;
2377 }
2378
Victor Stinner137c34c2010-09-29 10:25:54 +00002379 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2380 if (buffer == NULL) {
2381 PyErr_NoMemory();
2382 return NULL;
2383 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002384 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002385 if (buflen == -1)
2386 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002387 if (size != NULL)
2388 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002389 return buffer;
2390}
2391
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002392#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002393
Alexander Belopolsky40018472011-02-26 01:02:56 +00002394PyObject *
2395PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002396{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002397 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002398 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002399 PyErr_SetString(PyExc_ValueError,
2400 "chr() arg not in range(0x110000)");
2401 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002402 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002403
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002404 if (ordinal < 256)
2405 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002406
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002407 v = PyUnicode_New(1, ordinal);
2408 if (v == NULL)
2409 return NULL;
2410 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
2411 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002412}
2413
Alexander Belopolsky40018472011-02-26 01:02:56 +00002414PyObject *
2415PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002416{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002417 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002418 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002419 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002420 if (PyUnicode_READY(obj))
2421 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002422 Py_INCREF(obj);
2423 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002424 }
2425 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002426 /* For a Unicode subtype that's not a Unicode object,
2427 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002428 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002429 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002430 PyErr_Format(PyExc_TypeError,
2431 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002432 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002433 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002434}
2435
Alexander Belopolsky40018472011-02-26 01:02:56 +00002436PyObject *
2437PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002438 const char *encoding,
2439 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002440{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002441 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002442 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002443
Guido van Rossumd57fd912000-03-10 22:53:23 +00002444 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002445 PyErr_BadInternalCall();
2446 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002447 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002448
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002449 /* Decoding bytes objects is the most common case and should be fast */
2450 if (PyBytes_Check(obj)) {
2451 if (PyBytes_GET_SIZE(obj) == 0) {
2452 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002453 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002454 }
2455 else {
2456 v = PyUnicode_Decode(
2457 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2458 encoding, errors);
2459 }
2460 return v;
2461 }
2462
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002463 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002464 PyErr_SetString(PyExc_TypeError,
2465 "decoding str is not supported");
2466 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002467 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002468
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002469 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2470 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2471 PyErr_Format(PyExc_TypeError,
2472 "coercing to str: need bytes, bytearray "
2473 "or buffer-like object, %.80s found",
2474 Py_TYPE(obj)->tp_name);
2475 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002476 }
Tim Petersced69f82003-09-16 20:30:58 +00002477
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002478 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002479 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002480 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002481 }
Tim Petersced69f82003-09-16 20:30:58 +00002482 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002483 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002484
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002485 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002486 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002487}
2488
/* Copy encoding into lower, folding upper case letters to lower case and
   turning every '_' into '-', so that spellings such as UTF_8 are caught.

   lower_len is the capacity of lower in bytes, terminating null included.
   Return 1 on success; return 0 if encoding is longer than lower_len-1
   characters, in which case lower is left without a terminating null. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src = encoding;
    char *dst = lower;
    /* Last slot of the output buffer, reserved for the terminating null. */
    char *const last = &lower[lower_len - 1];

    for (; *src != '\0'; src++, dst++) {
        if (dst == last)
            return 0;

        if (Py_ISUPPER(*src))
            *dst = Py_TOLOWER(*src);
        else if (*src == '_')
            *dst = '-';
        else
            *dst = *src;
    }
    *dst = '\0';
    return 1;
}
2521
Alexander Belopolsky40018472011-02-26 01:02:56 +00002522PyObject *
2523PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002524 Py_ssize_t size,
2525 const char *encoding,
2526 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002527{
2528 PyObject *buffer = NULL, *unicode;
2529 Py_buffer info;
2530 char lower[11]; /* Enough for any encoding shortcut */
2531
2532 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002533 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00002534
2535 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002536 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002537 if ((strcmp(lower, "utf-8") == 0) ||
2538 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002539 return PyUnicode_DecodeUTF8(s, size, errors);
2540 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002541 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002542 (strcmp(lower, "iso-8859-1") == 0))
2543 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002544#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002545 else if (strcmp(lower, "mbcs") == 0)
2546 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002547#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002548 else if (strcmp(lower, "ascii") == 0)
2549 return PyUnicode_DecodeASCII(s, size, errors);
2550 else if (strcmp(lower, "utf-16") == 0)
2551 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2552 else if (strcmp(lower, "utf-32") == 0)
2553 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2554 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002555
2556 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002557 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002558 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002559 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002560 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002561 if (buffer == NULL)
2562 goto onError;
2563 unicode = PyCodec_Decode(buffer, encoding, errors);
2564 if (unicode == NULL)
2565 goto onError;
2566 if (!PyUnicode_Check(unicode)) {
2567 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002568 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002569 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002570 Py_DECREF(unicode);
2571 goto onError;
2572 }
2573 Py_DECREF(buffer);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02002574 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002575 Py_DECREF(unicode);
2576 return NULL;
2577 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002578 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002579
Benjamin Peterson29060642009-01-31 22:14:21 +00002580 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002581 Py_XDECREF(buffer);
2582 return NULL;
2583}
2584
Alexander Belopolsky40018472011-02-26 01:02:56 +00002585PyObject *
2586PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002587 const char *encoding,
2588 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002589{
2590 PyObject *v;
2591
2592 if (!PyUnicode_Check(unicode)) {
2593 PyErr_BadArgument();
2594 goto onError;
2595 }
2596
2597 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002598 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002599
2600 /* Decode via the codec registry */
2601 v = PyCodec_Decode(unicode, encoding, errors);
2602 if (v == NULL)
2603 goto onError;
2604 return v;
2605
Benjamin Peterson29060642009-01-31 22:14:21 +00002606 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002607 return NULL;
2608}
2609
Alexander Belopolsky40018472011-02-26 01:02:56 +00002610PyObject *
2611PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002612 const char *encoding,
2613 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002614{
2615 PyObject *v;
2616
2617 if (!PyUnicode_Check(unicode)) {
2618 PyErr_BadArgument();
2619 goto onError;
2620 }
2621
2622 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002623 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002624
2625 /* Decode via the codec registry */
2626 v = PyCodec_Decode(unicode, encoding, errors);
2627 if (v == NULL)
2628 goto onError;
2629 if (!PyUnicode_Check(v)) {
2630 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002631 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002632 Py_TYPE(v)->tp_name);
2633 Py_DECREF(v);
2634 goto onError;
2635 }
2636 return v;
2637
Benjamin Peterson29060642009-01-31 22:14:21 +00002638 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002639 return NULL;
2640}
2641
Alexander Belopolsky40018472011-02-26 01:02:56 +00002642PyObject *
2643PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002644 Py_ssize_t size,
2645 const char *encoding,
2646 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002647{
2648 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002649
Guido van Rossumd57fd912000-03-10 22:53:23 +00002650 unicode = PyUnicode_FromUnicode(s, size);
2651 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002652 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002653 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2654 Py_DECREF(unicode);
2655 return v;
2656}
2657
Alexander Belopolsky40018472011-02-26 01:02:56 +00002658PyObject *
2659PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002660 const char *encoding,
2661 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002662{
2663 PyObject *v;
2664
2665 if (!PyUnicode_Check(unicode)) {
2666 PyErr_BadArgument();
2667 goto onError;
2668 }
2669
2670 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002671 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002672
2673 /* Encode via the codec registry */
2674 v = PyCodec_Encode(unicode, encoding, errors);
2675 if (v == NULL)
2676 goto onError;
2677 return v;
2678
Benjamin Peterson29060642009-01-31 22:14:21 +00002679 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002680 return NULL;
2681}
2682
Victor Stinnerad158722010-10-27 00:25:46 +00002683PyObject *
2684PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00002685{
Victor Stinner99b95382011-07-04 14:23:54 +02002686#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002687 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2688 PyUnicode_GET_SIZE(unicode),
2689 NULL);
2690#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002691 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00002692#else
Victor Stinner793b5312011-04-27 00:24:21 +02002693 PyInterpreterState *interp = PyThreadState_GET()->interp;
2694 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2695 cannot use it to encode and decode filenames before it is loaded. Load
2696 the Python codec requires to encode at least its own filename. Use the C
2697 version of the locale codec until the codec registry is initialized and
2698 the Python codec is loaded.
2699
2700 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2701 cannot only rely on it: check also interp->fscodec_initialized for
2702 subinterpreters. */
2703 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00002704 return PyUnicode_AsEncodedString(unicode,
2705 Py_FileSystemDefaultEncoding,
2706 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00002707 }
2708 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002709 /* locale encoding with surrogateescape */
2710 wchar_t *wchar;
2711 char *bytes;
2712 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00002713 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002714
2715 wchar = PyUnicode_AsWideCharString(unicode, NULL);
2716 if (wchar == NULL)
2717 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002718 bytes = _Py_wchar2char(wchar, &error_pos);
2719 if (bytes == NULL) {
2720 if (error_pos != (size_t)-1) {
2721 char *errmsg = strerror(errno);
2722 PyObject *exc = NULL;
2723 if (errmsg == NULL)
2724 errmsg = "Py_wchar2char() failed";
2725 raise_encode_exception(&exc,
2726 "filesystemencoding",
2727 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
2728 error_pos, error_pos+1,
2729 errmsg);
2730 Py_XDECREF(exc);
2731 }
2732 else
2733 PyErr_NoMemory();
2734 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002735 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002736 }
2737 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002738
2739 bytes_obj = PyBytes_FromString(bytes);
2740 PyMem_Free(bytes);
2741 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00002742 }
Victor Stinnerad158722010-10-27 00:25:46 +00002743#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00002744}
2745
Alexander Belopolsky40018472011-02-26 01:02:56 +00002746PyObject *
2747PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002748 const char *encoding,
2749 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002750{
2751 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00002752 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00002753
Guido van Rossumd57fd912000-03-10 22:53:23 +00002754 if (!PyUnicode_Check(unicode)) {
2755 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002756 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002757 }
Fred Drakee4315f52000-05-09 19:53:39 +00002758
Victor Stinner2f283c22011-03-02 01:21:46 +00002759 if (encoding == NULL) {
2760 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002761 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002762 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002763 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner2f283c22011-03-02 01:21:46 +00002764 }
Fred Drakee4315f52000-05-09 19:53:39 +00002765
2766 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002767 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002768 if ((strcmp(lower, "utf-8") == 0) ||
2769 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00002770 {
Victor Stinner2f283c22011-03-02 01:21:46 +00002771 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002772 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002773 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002774 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00002775 }
Victor Stinner37296e82010-06-10 13:36:23 +00002776 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002777 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002778 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002779 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002780#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002781 else if (strcmp(lower, "mbcs") == 0)
2782 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2783 PyUnicode_GET_SIZE(unicode),
2784 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002785#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002786 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002787 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00002788 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002789
2790 /* Encode via the codec registry */
2791 v = PyCodec_Encode(unicode, encoding, errors);
2792 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002793 return NULL;
2794
2795 /* The normal path */
2796 if (PyBytes_Check(v))
2797 return v;
2798
2799 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002800 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002801 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002802 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002803
2804 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
2805 "encoder %s returned bytearray instead of bytes",
2806 encoding);
2807 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002808 Py_DECREF(v);
2809 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002810 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002811
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002812 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
2813 Py_DECREF(v);
2814 return b;
2815 }
2816
2817 PyErr_Format(PyExc_TypeError,
2818 "encoder did not return a bytes object (type=%.400s)",
2819 Py_TYPE(v)->tp_name);
2820 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002821 return NULL;
2822}
2823
Alexander Belopolsky40018472011-02-26 01:02:56 +00002824PyObject *
2825PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002826 const char *encoding,
2827 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002828{
2829 PyObject *v;
2830
2831 if (!PyUnicode_Check(unicode)) {
2832 PyErr_BadArgument();
2833 goto onError;
2834 }
2835
2836 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002837 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002838
2839 /* Encode via the codec registry */
2840 v = PyCodec_Encode(unicode, encoding, errors);
2841 if (v == NULL)
2842 goto onError;
2843 if (!PyUnicode_Check(v)) {
2844 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002845 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002846 Py_TYPE(v)->tp_name);
2847 Py_DECREF(v);
2848 goto onError;
2849 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002850 return v;
Tim Petersced69f82003-09-16 20:30:58 +00002851
Benjamin Peterson29060642009-01-31 22:14:21 +00002852 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002853 return NULL;
2854}
2855
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002856PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00002857PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002858 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00002859 return PyUnicode_DecodeFSDefaultAndSize(s, size);
2860}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002861
Christian Heimes5894ba72007-11-04 11:43:14 +00002862PyObject*
2863PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
2864{
Victor Stinner99b95382011-07-04 14:23:54 +02002865#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002866 return PyUnicode_DecodeMBCS(s, size, NULL);
2867#elif defined(__APPLE__)
2868 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
2869#else
Victor Stinner793b5312011-04-27 00:24:21 +02002870 PyInterpreterState *interp = PyThreadState_GET()->interp;
2871 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2872 cannot use it to encode and decode filenames before it is loaded. Load
2873 the Python codec requires to encode at least its own filename. Use the C
2874 version of the locale codec until the codec registry is initialized and
2875 the Python codec is loaded.
2876
2877 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2878 cannot only rely on it: check also interp->fscodec_initialized for
2879 subinterpreters. */
2880 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002881 return PyUnicode_Decode(s, size,
2882 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00002883 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002884 }
2885 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002886 /* locale encoding with surrogateescape */
2887 wchar_t *wchar;
2888 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00002889 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002890
2891 if (s[size] != '\0' || size != strlen(s)) {
2892 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2893 return NULL;
2894 }
2895
Victor Stinner168e1172010-10-16 23:16:16 +00002896 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002897 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00002898 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002899
Victor Stinner168e1172010-10-16 23:16:16 +00002900 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002901 PyMem_Free(wchar);
2902 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002903 }
Victor Stinnerad158722010-10-27 00:25:46 +00002904#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002905}
2906
Martin v. Löwis011e8422009-05-05 04:43:17 +00002907
2908int
2909PyUnicode_FSConverter(PyObject* arg, void* addr)
2910{
2911 PyObject *output = NULL;
2912 Py_ssize_t size;
2913 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002914 if (arg == NULL) {
2915 Py_DECREF(*(PyObject**)addr);
2916 return 1;
2917 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00002918 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00002919 output = arg;
2920 Py_INCREF(output);
2921 }
2922 else {
2923 arg = PyUnicode_FromObject(arg);
2924 if (!arg)
2925 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00002926 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002927 Py_DECREF(arg);
2928 if (!output)
2929 return 0;
2930 if (!PyBytes_Check(output)) {
2931 Py_DECREF(output);
2932 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
2933 return 0;
2934 }
2935 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00002936 size = PyBytes_GET_SIZE(output);
2937 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002938 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05002939 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00002940 Py_DECREF(output);
2941 return 0;
2942 }
2943 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002944 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00002945}
2946
2947
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002948int
2949PyUnicode_FSDecoder(PyObject* arg, void* addr)
2950{
2951 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002952 if (arg == NULL) {
2953 Py_DECREF(*(PyObject**)addr);
2954 return 1;
2955 }
2956 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002957 if (PyUnicode_READY(arg))
2958 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002959 output = arg;
2960 Py_INCREF(output);
2961 }
2962 else {
2963 arg = PyBytes_FromObject(arg);
2964 if (!arg)
2965 return 0;
2966 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
2967 PyBytes_GET_SIZE(arg));
2968 Py_DECREF(arg);
2969 if (!output)
2970 return 0;
2971 if (!PyUnicode_Check(output)) {
2972 Py_DECREF(output);
2973 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
2974 return 0;
2975 }
2976 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002977 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
2978 PyUnicode_GET_LENGTH(output), 0, 1)) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002979 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2980 Py_DECREF(output);
2981 return 0;
2982 }
2983 *(PyObject**)addr = output;
2984 return Py_CLEANUP_SUPPORTED;
2985}
2986
2987
Martin v. Löwis5b222132007-06-10 09:51:05 +00002988char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002989PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002990{
Christian Heimesf3863112007-11-22 07:46:41 +00002991 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002992 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
2993
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00002994 if (!PyUnicode_Check(unicode)) {
2995 PyErr_BadArgument();
2996 return NULL;
2997 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002998 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002999 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003000
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003001 if (PyUnicode_UTF8(unicode) == NULL) {
3002 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003003 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3004 if (bytes == NULL)
3005 return NULL;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003006 _PyUnicode_UTF8(u) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3007 if (_PyUnicode_UTF8(u) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003008 Py_DECREF(bytes);
3009 return NULL;
3010 }
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003011 _PyUnicode_UTF8_LENGTH(u) = PyBytes_GET_SIZE(bytes);
3012 Py_MEMCPY(_PyUnicode_UTF8(u), PyBytes_AS_STRING(bytes), _PyUnicode_UTF8_LENGTH(u) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003013 Py_DECREF(bytes);
3014 }
3015
3016 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003017 *psize = PyUnicode_UTF8_LENGTH(unicode);
3018 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003019}
3020
3021char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003022PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003023{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003024 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3025}
3026
3027#ifdef Py_DEBUG
3028int unicode_as_unicode_calls = 0;
3029#endif
3030
3031
3032Py_UNICODE *
3033PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3034{
3035 PyUnicodeObject *u;
3036 const unsigned char *one_byte;
3037#if SIZEOF_WCHAR_T == 4
3038 const Py_UCS2 *two_bytes;
3039#else
3040 const Py_UCS4 *four_bytes;
3041 const Py_UCS4 *ucs4_end;
3042 Py_ssize_t num_surrogates;
3043#endif
3044 wchar_t *w;
3045 wchar_t *wchar_end;
3046
3047 if (!PyUnicode_Check(unicode)) {
3048 PyErr_BadArgument();
3049 return NULL;
3050 }
3051 u = (PyUnicodeObject*)unicode;
3052 if (_PyUnicode_WSTR(u) == NULL) {
3053 /* Non-ASCII compact unicode object */
3054 assert(_PyUnicode_KIND(u) != 0);
3055 assert(PyUnicode_IS_READY(u));
3056
3057#ifdef Py_DEBUG
3058 ++unicode_as_unicode_calls;
3059#endif
3060
3061 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
3062#if SIZEOF_WCHAR_T == 2
3063 four_bytes = PyUnicode_4BYTE_DATA(u);
3064 ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
3065 num_surrogates = 0;
3066
3067 for (; four_bytes < ucs4_end; ++four_bytes) {
3068 if (*four_bytes > 0xFFFF)
3069 ++num_surrogates;
3070 }
3071
3072 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
3073 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
3074 if (!_PyUnicode_WSTR(u)) {
3075 PyErr_NoMemory();
3076 return NULL;
3077 }
3078 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;
3079
3080 w = _PyUnicode_WSTR(u);
3081 wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
3082 four_bytes = PyUnicode_4BYTE_DATA(u);
3083 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3084 if (*four_bytes > 0xFFFF) {
3085 /* encode surrogate pair in this case */
3086 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
3087 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
3088 }
3089 else
3090 *w = *four_bytes;
3091
3092 if (w > wchar_end) {
3093 assert(0 && "Miscalculated string end");
3094 }
3095 }
3096 *w = 0;
3097#else
3098 /* sizeof(wchar_t) == 4 */
3099 Py_FatalError("Impossible unicode object state, wstr and str "
3100 "should share memory already.");
3101 return NULL;
3102#endif
3103 }
3104 else {
3105 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3106 (_PyUnicode_LENGTH(u) + 1));
3107 if (!_PyUnicode_WSTR(u)) {
3108 PyErr_NoMemory();
3109 return NULL;
3110 }
3111 if (!PyUnicode_IS_COMPACT_ASCII(u))
3112 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
3113 w = _PyUnicode_WSTR(u);
3114 wchar_end = w + _PyUnicode_LENGTH(u);
3115
3116 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
3117 one_byte = PyUnicode_1BYTE_DATA(u);
3118 for (; w < wchar_end; ++one_byte, ++w)
3119 *w = *one_byte;
3120 /* null-terminate the wstr */
3121 *w = 0;
3122 }
3123 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
3124#if SIZEOF_WCHAR_T == 4
3125 two_bytes = PyUnicode_2BYTE_DATA(u);
3126 for (; w < wchar_end; ++two_bytes, ++w)
3127 *w = *two_bytes;
3128 /* null-terminate the wstr */
3129 *w = 0;
3130#else
3131 /* sizeof(wchar_t) == 2 */
3132 PyObject_FREE(_PyUnicode_WSTR(u));
3133 _PyUnicode_WSTR(u) = NULL;
3134 Py_FatalError("Impossible unicode object state, wstr "
3135 "and str should share memory already.");
3136 return NULL;
3137#endif
3138 }
3139 else {
3140 assert(0 && "This should never happen.");
3141 }
3142 }
3143 }
3144 if (size != NULL)
3145 *size = PyUnicode_WSTR_LENGTH(u);
3146 return _PyUnicode_WSTR(u);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003147}
3148
Alexander Belopolsky40018472011-02-26 01:02:56 +00003149Py_UNICODE *
3150PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003151{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003152 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003153}
3154
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003155
Alexander Belopolsky40018472011-02-26 01:02:56 +00003156Py_ssize_t
3157PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003158{
3159 if (!PyUnicode_Check(unicode)) {
3160 PyErr_BadArgument();
3161 goto onError;
3162 }
3163 return PyUnicode_GET_SIZE(unicode);
3164
Benjamin Peterson29060642009-01-31 22:14:21 +00003165 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003166 return -1;
3167}
3168
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003169Py_ssize_t
3170PyUnicode_GetLength(PyObject *unicode)
3171{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003172 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003173 PyErr_BadArgument();
3174 return -1;
3175 }
3176
3177 return PyUnicode_GET_LENGTH(unicode);
3178}
3179
3180Py_UCS4
3181PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3182{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003183 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3184 PyErr_BadArgument();
3185 return (Py_UCS4)-1;
3186 }
3187 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3188 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003189 return (Py_UCS4)-1;
3190 }
3191 return PyUnicode_READ_CHAR(unicode, index);
3192}
3193
3194int
3195PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3196{
3197 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003198 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003199 return -1;
3200 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02003201 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3202 PyErr_SetString(PyExc_IndexError, "string index out of range");
3203 return -1;
3204 }
3205 if (_PyUnicode_Dirty(unicode))
3206 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003207 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3208 index, ch);
3209 return 0;
3210}
3211
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3217
Victor Stinner554f3f02010-06-16 23:33:54 +00003218/* create or adjust a UnicodeDecodeError */
3219static void
3220make_decode_exception(PyObject **exceptionObject,
3221 const char *encoding,
3222 const char *input, Py_ssize_t length,
3223 Py_ssize_t startpos, Py_ssize_t endpos,
3224 const char *reason)
3225{
3226 if (*exceptionObject == NULL) {
3227 *exceptionObject = PyUnicodeDecodeError_Create(
3228 encoding, input, length, startpos, endpos, reason);
3229 }
3230 else {
3231 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3232 goto onError;
3233 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3234 goto onError;
3235 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3236 goto onError;
3237 }
3238 return;
3239
3240onError:
3241 Py_DECREF(*exceptionObject);
3242 *exceptionObject = NULL;
3243}
3244
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003245/* error handling callback helper:
3246 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003247 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003248 and adjust various state variables.
3249 return 0 on success, -1 on error
3250*/
3251
Alexander Belopolsky40018472011-02-26 01:02:56 +00003252static int
3253unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003254 const char *encoding, const char *reason,
3255 const char **input, const char **inend, Py_ssize_t *startinpos,
3256 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
3257 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003258{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003259 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003260
3261 PyObject *restuple = NULL;
3262 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003263 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003264 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003265 Py_ssize_t requiredsize;
3266 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003267 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003268 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003269 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003270 int res = -1;
3271
3272 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003273 *errorHandler = PyCodec_LookupError(errors);
3274 if (*errorHandler == NULL)
3275 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003276 }
3277
Victor Stinner554f3f02010-06-16 23:33:54 +00003278 make_decode_exception(exceptionObject,
3279 encoding,
3280 *input, *inend - *input,
3281 *startinpos, *endinpos,
3282 reason);
3283 if (*exceptionObject == NULL)
3284 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003285
3286 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3287 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003288 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003289 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003290 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003291 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003292 }
3293 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003294 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003295
3296 /* Copy back the bytes variables, which might have been modified by the
3297 callback */
3298 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3299 if (!inputobj)
3300 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003301 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003302 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003303 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003304 *input = PyBytes_AS_STRING(inputobj);
3305 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003306 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003307 /* we can DECREF safely, as the exception has another reference,
3308 so the object won't go away. */
3309 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003310
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003311 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003312 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003313 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003314 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3315 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003316 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003317
3318 /* need more space? (at least enough for what we
3319 have+the replacement+the rest of the string (starting
3320 at the new input position), so we won't have to check space
3321 when there are no errors in the rest of the string) */
3322 repptr = PyUnicode_AS_UNICODE(repunicode);
3323 repsize = PyUnicode_GET_SIZE(repunicode);
3324 requiredsize = *outpos + repsize + insize-newpos;
3325 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003326 if (requiredsize<2*outsize)
3327 requiredsize = 2*outsize;
Victor Stinnerfe226c02011-10-03 03:52:20 +02003328 if (PyUnicode_Resize((PyObject**)output, requiredsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003329 goto onError;
3330 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003331 }
3332 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003333 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003334 Py_UNICODE_COPY(*outptr, repptr, repsize);
3335 *outptr += repsize;
3336 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003337
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003338 /* we made it! */
3339 res = 0;
3340
Benjamin Peterson29060642009-01-31 22:14:21 +00003341 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003342 Py_XDECREF(restuple);
3343 return res;
3344}
3345
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Base-64 helper macros.  Each one may evaluate its argument several
   times, so only pass expressions without side effects. */

/* True if c is one of the 64 base-64 alphabet characters. */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* Base-64 value (0..63) of c; only meaningful when IS_BASE64(c) holds. */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* Base-64 character for the low 6 bits of n. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: a byte met outside a shift sequence decodes as itself.
 * Decoding is permissive: every ASCII byte qualifies except '+', which
 * opens a base-64 section. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * the class RFC2152 puts them in.  This table gives the class of each
 * ASCII code:
 *   0 : "Set D"
 *        alphanumeric and '(),-./:?
 *   1 : "Set O"
 *        !"#$%&*;<=>@[]^_`{|}
 *   2 : "whitespace"
 *        ht nl cr sp
 *   3 : special (must be base64 encoded)
 *        everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: true if c may be emitted as itself.  Set D characters
 * always may; Set O characters only when directO is true, and whitespace
 * only when directWS is true.  RFC2152 leaves both choices to the
 * application, so they are parameters here.  NUL and anything outside
 * ASCII never qualify. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003426
Alexander Belopolsky40018472011-02-26 01:02:56 +00003427PyObject *
3428PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003429 Py_ssize_t size,
3430 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003431{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003432 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3433}
3434
Antoine Pitrou244651a2009-05-04 18:56:13 +00003435/* The decoder. The only state we preserve is our read position,
3436 * i.e. how many characters we have consumed. So if we end in the
3437 * middle of a shift sequence we have to back off the read position
3438 * and the output to the beginning of the sequence, otherwise we lose
3439 * all the shift state (seen bits, number of bits seen, high
3440 * surrogate). */
3441
Alexander Belopolsky40018472011-02-26 01:02:56 +00003442PyObject *
3443PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003444 Py_ssize_t size,
3445 const char *errors,
3446 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003447{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003448 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003449 Py_ssize_t startinpos;
3450 Py_ssize_t endinpos;
3451 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003452 const char *e;
3453 PyUnicodeObject *unicode;
3454 Py_UNICODE *p;
3455 const char *errmsg = "";
3456 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003457 Py_UNICODE *shiftOutStart;
3458 unsigned int base64bits = 0;
3459 unsigned long base64buffer = 0;
3460 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003461 PyObject *errorHandler = NULL;
3462 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003463
3464 unicode = _PyUnicode_New(size);
3465 if (!unicode)
3466 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003467 if (size == 0) {
3468 if (consumed)
3469 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003470 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003471 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003472
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003473 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003474 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003475 e = s + size;
3476
3477 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003478 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003479 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003480 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003481
Antoine Pitrou244651a2009-05-04 18:56:13 +00003482 if (inShift) { /* in a base-64 section */
3483 if (IS_BASE64(ch)) { /* consume a base-64 character */
3484 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3485 base64bits += 6;
3486 s++;
3487 if (base64bits >= 16) {
3488 /* we have enough bits for a UTF-16 value */
3489 Py_UNICODE outCh = (Py_UNICODE)
3490 (base64buffer >> (base64bits-16));
3491 base64bits -= 16;
3492 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3493 if (surrogate) {
3494 /* expecting a second surrogate */
3495 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3496#ifdef Py_UNICODE_WIDE
3497 *p++ = (((surrogate & 0x3FF)<<10)
3498 | (outCh & 0x3FF)) + 0x10000;
3499#else
3500 *p++ = surrogate;
3501 *p++ = outCh;
3502#endif
3503 surrogate = 0;
3504 }
3505 else {
3506 surrogate = 0;
3507 errmsg = "second surrogate missing";
3508 goto utf7Error;
3509 }
3510 }
3511 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3512 /* first surrogate */
3513 surrogate = outCh;
3514 }
3515 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3516 errmsg = "unexpected second surrogate";
3517 goto utf7Error;
3518 }
3519 else {
3520 *p++ = outCh;
3521 }
3522 }
3523 }
3524 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003525 inShift = 0;
3526 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003527 if (surrogate) {
3528 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003529 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003530 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003531 if (base64bits > 0) { /* left-over bits */
3532 if (base64bits >= 6) {
3533 /* We've seen at least one base-64 character */
3534 errmsg = "partial character in shift sequence";
3535 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003536 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003537 else {
3538 /* Some bits remain; they should be zero */
3539 if (base64buffer != 0) {
3540 errmsg = "non-zero padding bits in shift sequence";
3541 goto utf7Error;
3542 }
3543 }
3544 }
3545 if (ch != '-') {
3546 /* '-' is absorbed; other terminating
3547 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003548 *p++ = ch;
3549 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003550 }
3551 }
3552 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003553 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003554 s++; /* consume '+' */
3555 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003556 s++;
3557 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003558 }
3559 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003560 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003561 shiftOutStart = p;
3562 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003563 }
3564 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003565 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003566 *p++ = ch;
3567 s++;
3568 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003569 else {
3570 startinpos = s-starts;
3571 s++;
3572 errmsg = "unexpected special character";
3573 goto utf7Error;
3574 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003575 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003576utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003577 outpos = p-PyUnicode_AS_UNICODE(unicode);
3578 endinpos = s-starts;
3579 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003580 errors, &errorHandler,
3581 "utf7", errmsg,
3582 &starts, &e, &startinpos, &endinpos, &exc, &s,
3583 &unicode, &outpos, &p))
3584 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003585 }
3586
Antoine Pitrou244651a2009-05-04 18:56:13 +00003587 /* end of string */
3588
3589 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3590 /* if we're in an inconsistent state, that's an error */
3591 if (surrogate ||
3592 (base64bits >= 6) ||
3593 (base64bits > 0 && base64buffer != 0)) {
3594 outpos = p-PyUnicode_AS_UNICODE(unicode);
3595 endinpos = size;
3596 if (unicode_decode_call_errorhandler(
3597 errors, &errorHandler,
3598 "utf7", "unterminated shift sequence",
3599 &starts, &e, &startinpos, &endinpos, &exc, &s,
3600 &unicode, &outpos, &p))
3601 goto onError;
3602 if (s < e)
3603 goto restart;
3604 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003605 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003606
3607 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003608 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003609 if (inShift) {
3610 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003611 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003612 }
3613 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003614 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003615 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003616 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003617
Victor Stinnerfe226c02011-10-03 03:52:20 +02003618 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003619 goto onError;
3620
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003621 Py_XDECREF(errorHandler);
3622 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02003623 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003624 Py_DECREF(unicode);
3625 return NULL;
3626 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003627 return (PyObject *)unicode;
3628
Benjamin Peterson29060642009-01-31 22:14:21 +00003629 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003630 Py_XDECREF(errorHandler);
3631 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003632 Py_DECREF(unicode);
3633 return NULL;
3634}
3635
3636
Alexander Belopolsky40018472011-02-26 01:02:56 +00003637PyObject *
3638PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003639 Py_ssize_t size,
3640 int base64SetO,
3641 int base64WhiteSpace,
3642 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003643{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003644 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003645 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003646 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003647 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003648 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003649 unsigned int base64bits = 0;
3650 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003651 char * out;
3652 char * start;
3653
3654 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003655 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003656
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003657 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003658 return PyErr_NoMemory();
3659
Antoine Pitrou244651a2009-05-04 18:56:13 +00003660 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003661 if (v == NULL)
3662 return NULL;
3663
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003664 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003665 for (;i < size; ++i) {
3666 Py_UNICODE ch = s[i];
3667
Antoine Pitrou244651a2009-05-04 18:56:13 +00003668 if (inShift) {
3669 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3670 /* shifting out */
3671 if (base64bits) { /* output remaining bits */
3672 *out++ = TO_BASE64(base64buffer << (6-base64bits));
3673 base64buffer = 0;
3674 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003675 }
3676 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003677 /* Characters not in the BASE64 set implicitly unshift the sequence
3678 so no '-' is required, except if the character is itself a '-' */
3679 if (IS_BASE64(ch) || ch == '-') {
3680 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003681 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003682 *out++ = (char) ch;
3683 }
3684 else {
3685 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00003686 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003687 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003688 else { /* not in a shift sequence */
3689 if (ch == '+') {
3690 *out++ = '+';
3691 *out++ = '-';
3692 }
3693 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3694 *out++ = (char) ch;
3695 }
3696 else {
3697 *out++ = '+';
3698 inShift = 1;
3699 goto encode_char;
3700 }
3701 }
3702 continue;
3703encode_char:
3704#ifdef Py_UNICODE_WIDE
3705 if (ch >= 0x10000) {
3706 /* code first surrogate */
3707 base64bits += 16;
3708 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
3709 while (base64bits >= 6) {
3710 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3711 base64bits -= 6;
3712 }
3713 /* prepare second surrogate */
3714 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
3715 }
3716#endif
3717 base64bits += 16;
3718 base64buffer = (base64buffer << 16) | ch;
3719 while (base64bits >= 6) {
3720 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3721 base64bits -= 6;
3722 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00003723 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003724 if (base64bits)
3725 *out++= TO_BASE64(base64buffer << (6-base64bits) );
3726 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003727 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003728 if (_PyBytes_Resize(&v, out - start) < 0)
3729 return NULL;
3730 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003731}
3732
/* Retire the base64 / direct-encoding helper macros used by the codec
   routines above so that their names are free again for the rest of the
   file. */
#undef ENCODE_DIRECT
#undef DECODE_DIRECT
#undef TO_BASE64
#undef FROM_BASE64
#undef IS_BASE64
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003738
Guido van Rossumd57fd912000-03-10 22:53:23 +00003739/* --- UTF-8 Codec -------------------------------------------------------- */
3740
/* Map a UTF-8 lead (prefix) byte to the total length in bytes of the
   sequence it introduces.  Zero means the byte is not a legal prefix.
   See RFC 3629 for details.

   The table is only ever read, so it is declared const. */
static const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F: continuation bytes */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 (illegal) + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF (illegal) */
};
3762
Alexander Belopolsky40018472011-02-26 01:02:56 +00003763PyObject *
3764PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003765 Py_ssize_t size,
3766 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003767{
Walter Dörwald69652032004-09-07 20:24:22 +00003768 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3769}
3770
Antoine Pitrouab868312009-01-10 15:40:25 +00003771/* Mask to check or force alignment of a pointer to C 'long' boundaries */
3772#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
3773
3774/* Mask to quickly check whether a C 'long' contains a
3775 non-ASCII, UTF8-encoded char. */
3776#if (SIZEOF_LONG == 8)
3777# define ASCII_CHAR_MASK 0x8080808080808080L
3778#elif (SIZEOF_LONG == 4)
3779# define ASCII_CHAR_MASK 0x80808080L
3780#else
3781# error C 'long' size should be either 4 or 8!
3782#endif
3783
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003784/* Scans a UTF-8 string and returns the maximum character to be expected,
3785 the size of the decoded unicode string and if any major errors were
3786 encountered.
3787
3788 This function does check basic UTF-8 sanity, it does however NOT CHECK
3789 if the string contains surrogates, and if all continuation bytes are
3790 within the correct ranges, these checks are performed in
3791 PyUnicode_DecodeUTF8Stateful.
3792
3793 If it sets has_errors to 1, it means the value of unicode_size and max_char
3794 will be bogus and you should not rely on useful information in them.
3795 */
3796static Py_UCS4
3797utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
3798 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
3799 int *has_errors)
3800{
3801 Py_ssize_t n;
3802 Py_ssize_t char_count = 0;
3803 Py_UCS4 max_char = 127, new_max;
3804 Py_UCS4 upper_bound;
3805 const unsigned char *p = (const unsigned char *)s;
3806 const unsigned char *end = p + string_size;
3807 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
3808 int err = 0;
3809
3810 for (; p < end && !err; ++p, ++char_count) {
3811 /* Only check value if it's not a ASCII char... */
3812 if (*p < 0x80) {
3813 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
3814 an explanation. */
3815 if (!((size_t) p & LONG_PTR_MASK)) {
3816 /* Help register allocation */
3817 register const unsigned char *_p = p;
3818 while (_p < aligned_end) {
3819 unsigned long value = *(unsigned long *) _p;
3820 if (value & ASCII_CHAR_MASK)
3821 break;
3822 _p += SIZEOF_LONG;
3823 char_count += SIZEOF_LONG;
3824 }
3825 p = _p;
3826 if (p == end)
3827 break;
3828 }
3829 }
3830 if (*p >= 0x80) {
3831 n = utf8_code_length[*p];
3832 new_max = max_char;
3833 switch (n) {
3834 /* invalid start byte */
3835 case 0:
3836 err = 1;
3837 break;
3838 case 2:
3839 /* Code points between 0x00FF and 0x07FF inclusive.
3840 Approximate the upper bound of the code point,
3841 if this flips over 255 we can be sure it will be more
3842 than 255 and the string will need 2 bytes per code coint,
3843 if it stays under or equal to 255, we can be sure 1 byte
3844 is enough.
3845 ((*p & 0b00011111) << 6) | 0b00111111 */
3846 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
3847 if (max_char < upper_bound)
3848 new_max = upper_bound;
3849 /* Ensure we track at least that we left ASCII space. */
3850 if (new_max < 128)
3851 new_max = 128;
3852 break;
3853 case 3:
3854 /* Between 0x0FFF and 0xFFFF inclusive, so values are
3855 always > 255 and <= 65535 and will always need 2 bytes. */
3856 if (max_char < 65535)
3857 new_max = 65535;
3858 break;
3859 case 4:
3860 /* Code point will be above 0xFFFF for sure in this case. */
3861 new_max = 65537;
3862 break;
3863 /* Internal error, this should be caught by the first if */
3864 case 1:
3865 default:
3866 assert(0 && "Impossible case in utf8_max_char_and_size");
3867 err = 1;
3868 }
3869 /* Instead of number of overall bytes for this code point,
3870 n containts the number of following bytes: */
3871 --n;
3872 /* Check if the follow up chars are all valid continuation bytes */
3873 if (n >= 1) {
3874 const unsigned char *cont;
3875 if ((p + n) >= end) {
3876 if (consumed == 0)
3877 /* incomplete data, non-incremental decoding */
3878 err = 1;
3879 break;
3880 }
3881 for (cont = p + 1; cont < (p + n); ++cont) {
3882 if ((*cont & 0xc0) != 0x80) {
3883 err = 1;
3884 break;
3885 }
3886 }
3887 p += n;
3888 }
3889 else
3890 err = 1;
3891 max_char = new_max;
3892 }
3893 }
3894
3895 if (unicode_size)
3896 *unicode_size = char_count;
3897 if (has_errors)
3898 *has_errors = err;
3899 return max_char;
3900}
3901
/* Store `value` at position `index` of the character buffer `buf`, cast to
   the element type selected by `kind`.  Similar to PyUnicode_WRITE, but it
   can also write into the wstr field of the legacy unicode representation
   (PyUnicode_WCHAR_KIND).  Any kind other than the WCHAR, 1BYTE and 2BYTE
   ones is stored as Py_UCS4.  `kind` is evaluated exactly once and only a
   single store is executed. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value)                     \
    do {                                                                    \
        const int kind_ = (kind);                                           \
        switch (kind_) {                                                    \
        case PyUnicode_WCHAR_KIND:                                          \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);           \
            break;                                                          \
        case PyUnicode_1BYTE_KIND:                                          \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value);     \
            break;                                                          \
        case PyUnicode_2BYTE_KIND:                                          \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value);                 \
            break;                                                          \
        default:                                                            \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value);                 \
            break;                                                          \
        }                                                                   \
    } while (0)
3916
Alexander Belopolsky40018472011-02-26 01:02:56 +00003917PyObject *
3918PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003919 Py_ssize_t size,
3920 const char *errors,
3921 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003922{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003923 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003924 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00003925 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003926 Py_ssize_t startinpos;
3927 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00003928 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003929 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003930 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003931 PyObject *errorHandler = NULL;
3932 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003933 Py_UCS4 maxchar = 0;
3934 Py_ssize_t unicode_size;
3935 Py_ssize_t i;
3936 int kind;
3937 void *data;
3938 int has_errors;
3939 Py_UNICODE *error_outptr;
3940#if SIZEOF_WCHAR_T == 2
3941 Py_ssize_t wchar_offset = 0;
3942#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003943
Walter Dörwald69652032004-09-07 20:24:22 +00003944 if (size == 0) {
3945 if (consumed)
3946 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003947 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00003948 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003949 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
3950 consumed, &has_errors);
3951 if (has_errors) {
3952 unicode = _PyUnicode_New(size);
3953 if (!unicode)
3954 return NULL;
3955 kind = PyUnicode_WCHAR_KIND;
3956 data = PyUnicode_AS_UNICODE(unicode);
3957 assert(data != NULL);
3958 }
3959 else {
3960 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
3961 if (!unicode)
3962 return NULL;
3963 /* When the string is ASCII only, just use memcpy and return.
3964 unicode_size may be != size if there is an incomplete UTF-8
3965 sequence at the end of the ASCII block. */
3966 if (maxchar < 128 && size == unicode_size) {
3967 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
3968 return (PyObject *)unicode;
3969 }
3970 kind = PyUnicode_KIND(unicode);
3971 data = PyUnicode_DATA(unicode);
3972 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003973 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003974 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003975 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00003976 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003977
3978 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003979 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003980
3981 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00003982 /* Fast path for runs of ASCII characters. Given that common UTF-8
3983 input will consist of an overwhelming majority of ASCII
3984 characters, we try to optimize for this case by checking
3985 as many characters as a C 'long' can contain.
3986 First, check if we can do an aligned read, as most CPUs have
3987 a penalty for unaligned reads.
3988 */
3989 if (!((size_t) s & LONG_PTR_MASK)) {
3990 /* Help register allocation */
3991 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003992 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003993 while (_s < aligned_end) {
3994 /* Read a whole long at a time (either 4 or 8 bytes),
3995 and do a fast unrolled copy if it only contains ASCII
3996 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003997 unsigned long value = *(unsigned long *) _s;
3998 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00003999 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004000 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
4001 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
4002 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
4003 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004004#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004005 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
4006 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
4007 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
4008 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004009#endif
4010 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004011 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004012 }
4013 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004014 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004015 if (s == e)
4016 break;
4017 ch = (unsigned char)*s;
4018 }
4019 }
4020
4021 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004022 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004023 s++;
4024 continue;
4025 }
4026
4027 n = utf8_code_length[ch];
4028
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004029 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004030 if (consumed)
4031 break;
4032 else {
4033 errmsg = "unexpected end of data";
4034 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004035 endinpos = startinpos+1;
4036 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4037 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004038 goto utf8Error;
4039 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004040 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004041
4042 switch (n) {
4043
4044 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004045 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004046 startinpos = s-starts;
4047 endinpos = startinpos+1;
4048 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004049
4050 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004051 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004052 startinpos = s-starts;
4053 endinpos = startinpos+1;
4054 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004055
4056 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004057 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004058 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004059 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004060 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004061 goto utf8Error;
4062 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004063 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004064 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004065 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004066 break;
4067
4068 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004069 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4070 will result in surrogates in range d800-dfff. Surrogates are
4071 not valid UTF-8 so they are rejected.
4072 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4073 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004074 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004075 (s[2] & 0xc0) != 0x80 ||
4076 ((unsigned char)s[0] == 0xE0 &&
4077 (unsigned char)s[1] < 0xA0) ||
4078 ((unsigned char)s[0] == 0xED &&
4079 (unsigned char)s[1] > 0x9F)) {
4080 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004081 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004082 endinpos = startinpos + 1;
4083
4084 /* if s[1] first two bits are 1 and 0, then the invalid
4085 continuation byte is s[2], so increment endinpos by 1,
4086 if not, s[1] is invalid and endinpos doesn't need to
4087 be incremented. */
4088 if ((s[1] & 0xC0) == 0x80)
4089 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004090 goto utf8Error;
4091 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004092 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004093 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004094 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004095 break;
4096
4097 case 4:
4098 if ((s[1] & 0xc0) != 0x80 ||
4099 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004100 (s[3] & 0xc0) != 0x80 ||
4101 ((unsigned char)s[0] == 0xF0 &&
4102 (unsigned char)s[1] < 0x90) ||
4103 ((unsigned char)s[0] == 0xF4 &&
4104 (unsigned char)s[1] > 0x8F)) {
4105 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004106 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004107 endinpos = startinpos + 1;
4108 if ((s[1] & 0xC0) == 0x80) {
4109 endinpos++;
4110 if ((s[2] & 0xC0) == 0x80)
4111 endinpos++;
4112 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004113 goto utf8Error;
4114 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004115 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004116 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4117 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4118
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004119 /* If the string is flexible or we have native UCS-4, write
4120 directly.. */
4121 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
4122 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00004123
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004124 else {
4125 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00004126
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004127 /* translate from 10000..10FFFF to 0..FFFF */
4128 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00004129
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004130 /* high surrogate = top 10 bits added to D800 */
4131 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4132 (Py_UNICODE)(0xD800 + (ch >> 10)));
4133
4134 /* low surrogate = bottom 10 bits added to DC00 */
4135 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4136 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
4137 }
4138#if SIZEOF_WCHAR_T == 2
4139 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004140#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004141 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004142 }
4143 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004144 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004145
Benjamin Peterson29060642009-01-31 22:14:21 +00004146 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004147 /* If this is not yet a resizable string, make it one.. */
4148 if (kind != PyUnicode_WCHAR_KIND) {
4149 const Py_UNICODE *u;
4150 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
4151 if (!new_unicode)
4152 goto onError;
4153 u = PyUnicode_AsUnicode((PyObject *)unicode);
4154 if (!u)
4155 goto onError;
4156#if SIZEOF_WCHAR_T == 2
4157 i += wchar_offset;
4158#endif
4159 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
4160 Py_DECREF(unicode);
4161 unicode = new_unicode;
4162 kind = 0;
4163 data = PyUnicode_AS_UNICODE(new_unicode);
4164 assert(data != NULL);
4165 }
4166 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00004167 if (unicode_decode_call_errorhandler(
4168 errors, &errorHandler,
4169 "utf8", errmsg,
4170 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004171 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00004172 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004173 /* Update data because unicode_decode_call_errorhandler might have
4174 re-created or resized the unicode object. */
4175 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00004176 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004177 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004178 /* Ensure the unicode_size calculation above was correct: */
4179 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
4180
Walter Dörwald69652032004-09-07 20:24:22 +00004181 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004182 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004183
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004184 /* Adjust length and ready string when it contained errors and
4185 is of the old resizable kind. */
4186 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004187 if (PyUnicode_Resize((PyObject**)&unicode, i) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004188 goto onError;
4189 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004190
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004191 Py_XDECREF(errorHandler);
4192 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004193 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004194 Py_DECREF(unicode);
4195 return NULL;
4196 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004197 return (PyObject *)unicode;
4198
Benjamin Peterson29060642009-01-31 22:14:21 +00004199 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004200 Py_XDECREF(errorHandler);
4201 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004202 Py_DECREF(unicode);
4203 return NULL;
4204}
4205
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004206#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00004207
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Decodes the `size` bytes at `s` into a buffer obtained from
   PyMem_Malloc() and returns it, terminated by L'\0'.  A byte that does
   not begin a well-formed sequence is emitted as 0xDC00 + byte and decoding
   resumes with the next byte.  Returns NULL if the buffer size would
   overflow (PyErr_NoMemory() is called in that case) or if the allocation
   fails. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *end;
    wchar_t *buffer, *out;
    int seqlen;
    int valid;

    /* Note: size will always be longer than the resulting Unicode
       character count, so size + 1 slots are enough. */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    buffer = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (buffer == NULL)
        return NULL;

    /* Unpack UTF-8 encoded data */
    out = buffer;
    end = s + size;
    while (s < end) {
        Py_UCS4 ch = (unsigned char)*s;

        /* ASCII is copied through unchanged */
        if (ch < 0x80) {
            *out++ = (wchar_t)ch;
            s++;
            continue;
        }

        seqlen = utf8_code_length[ch];
        /* a sequence running past the end of the input is never valid */
        valid = !(s + seqlen > end);

        if (valid) {
            switch (seqlen) {
            case 0:
            case 1:
                /* not a legal lead byte */
                valid = 0;
                break;

            case 2:
                valid = ((s[1] & 0xc0) == 0x80);
                if (valid) {
                    ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
                    assert ((ch > 0x007F) && (ch <= 0x07FF));
                    *out++ = (wchar_t)ch;
                }
                break;

            case 3:
                /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
                   will result in surrogates in range d800-dfff. Surrogates
                   are not valid UTF-8 so they are rejected.
                   See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
                   (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
                valid = (s[1] & 0xc0) == 0x80 &&
                        (s[2] & 0xc0) == 0x80 &&
                        !((unsigned char)s[0] == 0xE0 &&
                          (unsigned char)s[1] < 0xA0) &&
                        !((unsigned char)s[0] == 0xED &&
                          (unsigned char)s[1] > 0x9F);
                if (valid) {
                    ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
                    assert ((ch > 0x07FF) && (ch <= 0xFFFF));
                    *out++ = (wchar_t)ch;
                }
                break;

            case 4:
                valid = (s[1] & 0xc0) == 0x80 &&
                        (s[2] & 0xc0) == 0x80 &&
                        (s[3] & 0xc0) == 0x80 &&
                        !((unsigned char)s[0] == 0xF0 &&
                          (unsigned char)s[1] < 0x90) &&
                        !((unsigned char)s[0] == 0xF4 &&
                          (unsigned char)s[1] > 0x8F);
                if (valid) {
                    ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                         ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
                    assert ((ch > 0xFFFF) && (ch <= 0x10ffff));

#if SIZEOF_WCHAR_T == 4
                    *out++ = (wchar_t)ch;
#else
                    /* compute and append the two surrogates: */

                    /* translate from 10000..10FFFF to 0..FFFF */
                    ch -= 0x10000;

                    /* high surrogate = top 10 bits added to D800 */
                    *out++ = (wchar_t)(0xD800 + (ch >> 10));

                    /* low surrogate = bottom 10 bits added to DC00 */
                    *out++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
#endif
                }
                break;
            }
        }

        if (valid) {
            s += seqlen;
        }
        else {
            /* surrogateescape: ch still holds the offending byte, as it is
               only overwritten once a sequence has been validated */
            *out++ = 0xDC00 + ch;
            s++;
        }
    }
    *out = L'\0';
    return buffer;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004322
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004323/* Primary internal function which creates utf8 encoded bytes objects.
4324
4325 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004326 and allocate exactly as much space needed at the end. Else allocate the
4327 maximum possible needed (4 result bytes per Unicode character), and return
4328 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004329*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004330PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004331_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004332{
Tim Peters602f7402002-04-27 18:03:26 +00004333#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004334
Guido van Rossum98297ee2007-11-06 21:34:58 +00004335 Py_ssize_t i; /* index into s of next input byte */
4336 PyObject *result; /* result string object */
4337 char *p; /* next free byte in output buffer */
4338 Py_ssize_t nallocated; /* number of result bytes allocated */
4339 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004340 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004341 PyObject *errorHandler = NULL;
4342 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004343 int kind;
4344 void *data;
4345 Py_ssize_t size;
4346 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
4347#if SIZEOF_WCHAR_T == 2
4348 Py_ssize_t wchar_offset = 0;
4349#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004350
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004351 if (!PyUnicode_Check(unicode)) {
4352 PyErr_BadArgument();
4353 return NULL;
4354 }
4355
4356 if (PyUnicode_READY(unicode) == -1)
4357 return NULL;
4358
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004359 if (PyUnicode_UTF8(unicode))
4360 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4361 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004362
4363 kind = PyUnicode_KIND(unicode);
4364 data = PyUnicode_DATA(unicode);
4365 size = PyUnicode_GET_LENGTH(unicode);
4366
Tim Peters602f7402002-04-27 18:03:26 +00004367 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004368
Tim Peters602f7402002-04-27 18:03:26 +00004369 if (size <= MAX_SHORT_UNICHARS) {
4370 /* Write into the stack buffer; nallocated can't overflow.
4371 * At the end, we'll allocate exactly as much heap space as it
4372 * turns out we need.
4373 */
4374 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004375 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004376 p = stackbuf;
4377 }
4378 else {
4379 /* Overallocate on the heap, and give the excess back at the end. */
4380 nallocated = size * 4;
4381 if (nallocated / 4 != size) /* overflow! */
4382 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004383 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004384 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004385 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004386 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004387 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004388
Tim Peters602f7402002-04-27 18:03:26 +00004389 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004390 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004391
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004392 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004393 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004394 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004395
Guido van Rossumd57fd912000-03-10 22:53:23 +00004396 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004397 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004398 *p++ = (char)(0xc0 | (ch >> 6));
4399 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004400 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004401 Py_ssize_t newpos;
4402 PyObject *rep;
4403 Py_ssize_t repsize, k, startpos;
4404 startpos = i-1;
4405#if SIZEOF_WCHAR_T == 2
4406 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00004407#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004408 rep = unicode_encode_call_errorhandler(
4409 errors, &errorHandler, "utf-8", "surrogates not allowed",
4410 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4411 &exc, startpos, startpos+1, &newpos);
4412 if (!rep)
4413 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004414
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004415 if (PyBytes_Check(rep))
4416 repsize = PyBytes_GET_SIZE(rep);
4417 else
4418 repsize = PyUnicode_GET_SIZE(rep);
4419
4420 if (repsize > 4) {
4421 Py_ssize_t offset;
4422
4423 if (result == NULL)
4424 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004425 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004426 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004427
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004428 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4429 /* integer overflow */
4430 PyErr_NoMemory();
4431 goto error;
4432 }
4433 nallocated += repsize - 4;
4434 if (result != NULL) {
4435 if (_PyBytes_Resize(&result, nallocated) < 0)
4436 goto error;
4437 } else {
4438 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004439 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004440 goto error;
4441 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4442 }
4443 p = PyBytes_AS_STRING(result) + offset;
4444 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004445
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004446 if (PyBytes_Check(rep)) {
4447 char *prep = PyBytes_AS_STRING(rep);
4448 for(k = repsize; k > 0; k--)
4449 *p++ = *prep++;
4450 } else /* rep is unicode */ {
4451 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4452 Py_UNICODE c;
4453
4454 for(k=0; k<repsize; k++) {
4455 c = prep[k];
4456 if (0x80 <= c) {
4457 raise_encode_exception(&exc, "utf-8",
4458 PyUnicode_AS_UNICODE(unicode),
4459 size, i-1, i,
4460 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004461 goto error;
4462 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004463 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004464 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004465 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004466 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004467 } else if (ch < 0x10000) {
4468 *p++ = (char)(0xe0 | (ch >> 12));
4469 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4470 *p++ = (char)(0x80 | (ch & 0x3f));
4471 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004472 /* Encode UCS4 Unicode ordinals */
4473 *p++ = (char)(0xf0 | (ch >> 18));
4474 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4475 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4476 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004477#if SIZEOF_WCHAR_T == 2
4478 wchar_offset++;
4479#endif
Tim Peters602f7402002-04-27 18:03:26 +00004480 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004481 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004482
Guido van Rossum98297ee2007-11-06 21:34:58 +00004483 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004484 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004485 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004486 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004487 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004488 }
4489 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004490 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004491 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004492 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004493 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004494 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004495
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004496 Py_XDECREF(errorHandler);
4497 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004498 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004499 error:
4500 Py_XDECREF(errorHandler);
4501 Py_XDECREF(exc);
4502 Py_XDECREF(result);
4503 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004504
Tim Peters602f7402002-04-27 18:03:26 +00004505#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004506}
4507
Alexander Belopolsky40018472011-02-26 01:02:56 +00004508PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004509PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4510 Py_ssize_t size,
4511 const char *errors)
4512{
4513 PyObject *v, *unicode;
4514
4515 unicode = PyUnicode_FromUnicode(s, size);
4516 if (unicode == NULL)
4517 return NULL;
4518 v = _PyUnicode_AsUTF8String(unicode, errors);
4519 Py_DECREF(unicode);
4520 return v;
4521}
4522
4523PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004524PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004525{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004526 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004527}
4528
Walter Dörwald41980ca2007-08-16 21:55:45 +00004529/* --- UTF-32 Codec ------------------------------------------------------- */
4530
4531PyObject *
4532PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004533 Py_ssize_t size,
4534 const char *errors,
4535 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004536{
4537 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4538}
4539
4540PyObject *
4541PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004542 Py_ssize_t size,
4543 const char *errors,
4544 int *byteorder,
4545 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004546{
4547 const char *starts = s;
4548 Py_ssize_t startinpos;
4549 Py_ssize_t endinpos;
4550 Py_ssize_t outpos;
4551 PyUnicodeObject *unicode;
4552 Py_UNICODE *p;
4553#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004554 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004555 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004556#else
4557 const int pairs = 0;
4558#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004559 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004560 int bo = 0; /* assume native ordering by default */
4561 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004562 /* Offsets from q for retrieving bytes in the right order. */
4563#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4564 int iorder[] = {0, 1, 2, 3};
4565#else
4566 int iorder[] = {3, 2, 1, 0};
4567#endif
4568 PyObject *errorHandler = NULL;
4569 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004570
Walter Dörwald41980ca2007-08-16 21:55:45 +00004571 q = (unsigned char *)s;
4572 e = q + size;
4573
4574 if (byteorder)
4575 bo = *byteorder;
4576
4577 /* Check for BOM marks (U+FEFF) in the input and adjust current
4578 byte order setting accordingly. In native mode, the leading BOM
4579 mark is skipped, in all other modes, it is copied to the output
4580 stream as-is (giving a ZWNBSP character). */
4581 if (bo == 0) {
4582 if (size >= 4) {
4583 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004584 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004585#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004586 if (bom == 0x0000FEFF) {
4587 q += 4;
4588 bo = -1;
4589 }
4590 else if (bom == 0xFFFE0000) {
4591 q += 4;
4592 bo = 1;
4593 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004594#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004595 if (bom == 0x0000FEFF) {
4596 q += 4;
4597 bo = 1;
4598 }
4599 else if (bom == 0xFFFE0000) {
4600 q += 4;
4601 bo = -1;
4602 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004603#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004604 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004605 }
4606
4607 if (bo == -1) {
4608 /* force LE */
4609 iorder[0] = 0;
4610 iorder[1] = 1;
4611 iorder[2] = 2;
4612 iorder[3] = 3;
4613 }
4614 else if (bo == 1) {
4615 /* force BE */
4616 iorder[0] = 3;
4617 iorder[1] = 2;
4618 iorder[2] = 1;
4619 iorder[3] = 0;
4620 }
4621
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004622 /* On narrow builds we split characters outside the BMP into two
4623 codepoints => count how much extra space we need. */
4624#ifndef Py_UNICODE_WIDE
4625 for (qq = q; qq < e; qq += 4)
4626 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4627 pairs++;
4628#endif
4629
4630 /* This might be one to much, because of a BOM */
4631 unicode = _PyUnicode_New((size+3)/4+pairs);
4632 if (!unicode)
4633 return NULL;
4634 if (size == 0)
4635 return (PyObject *)unicode;
4636
4637 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004638 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004639
Walter Dörwald41980ca2007-08-16 21:55:45 +00004640 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004641 Py_UCS4 ch;
4642 /* remaining bytes at the end? (size should be divisible by 4) */
4643 if (e-q<4) {
4644 if (consumed)
4645 break;
4646 errmsg = "truncated data";
4647 startinpos = ((const char *)q)-starts;
4648 endinpos = ((const char *)e)-starts;
4649 goto utf32Error;
4650 /* The remaining input chars are ignored if the callback
4651 chooses to skip the input */
4652 }
4653 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4654 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004655
Benjamin Peterson29060642009-01-31 22:14:21 +00004656 if (ch >= 0x110000)
4657 {
4658 errmsg = "codepoint not in range(0x110000)";
4659 startinpos = ((const char *)q)-starts;
4660 endinpos = startinpos+4;
4661 goto utf32Error;
4662 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004663#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004664 if (ch >= 0x10000)
4665 {
4666 *p++ = 0xD800 | ((ch-0x10000) >> 10);
4667 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4668 }
4669 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00004670#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004671 *p++ = ch;
4672 q += 4;
4673 continue;
4674 utf32Error:
4675 outpos = p-PyUnicode_AS_UNICODE(unicode);
4676 if (unicode_decode_call_errorhandler(
4677 errors, &errorHandler,
4678 "utf32", errmsg,
4679 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4680 &unicode, &outpos, &p))
4681 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004682 }
4683
4684 if (byteorder)
4685 *byteorder = bo;
4686
4687 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004688 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004689
4690 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02004691 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004692 goto onError;
4693
4694 Py_XDECREF(errorHandler);
4695 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004696 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004697 Py_DECREF(unicode);
4698 return NULL;
4699 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004700 return (PyObject *)unicode;
4701
Benjamin Peterson29060642009-01-31 22:14:21 +00004702 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00004703 Py_DECREF(unicode);
4704 Py_XDECREF(errorHandler);
4705 Py_XDECREF(exc);
4706 return NULL;
4707}
4708
4709PyObject *
4710PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004711 Py_ssize_t size,
4712 const char *errors,
4713 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004714{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004715 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004716 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004717 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004718#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004719 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004720#else
4721 const int pairs = 0;
4722#endif
4723 /* Offsets from p for storing byte pairs in the right order. */
4724#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4725 int iorder[] = {0, 1, 2, 3};
4726#else
4727 int iorder[] = {3, 2, 1, 0};
4728#endif
4729
Benjamin Peterson29060642009-01-31 22:14:21 +00004730#define STORECHAR(CH) \
4731 do { \
4732 p[iorder[3]] = ((CH) >> 24) & 0xff; \
4733 p[iorder[2]] = ((CH) >> 16) & 0xff; \
4734 p[iorder[1]] = ((CH) >> 8) & 0xff; \
4735 p[iorder[0]] = (CH) & 0xff; \
4736 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00004737 } while(0)
4738
4739 /* In narrow builds we can output surrogate pairs as one codepoint,
4740 so we need less space. */
4741#ifndef Py_UNICODE_WIDE
4742 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004743 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
4744 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
4745 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004746#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004747 nsize = (size - pairs + (byteorder == 0));
4748 bytesize = nsize * 4;
4749 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004750 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004751 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004752 if (v == NULL)
4753 return NULL;
4754
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004755 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004756 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004757 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004758 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004759 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004760
4761 if (byteorder == -1) {
4762 /* force LE */
4763 iorder[0] = 0;
4764 iorder[1] = 1;
4765 iorder[2] = 2;
4766 iorder[3] = 3;
4767 }
4768 else if (byteorder == 1) {
4769 /* force BE */
4770 iorder[0] = 3;
4771 iorder[1] = 2;
4772 iorder[2] = 1;
4773 iorder[3] = 0;
4774 }
4775
4776 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004777 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004778#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004779 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
4780 Py_UCS4 ch2 = *s;
4781 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
4782 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
4783 s++;
4784 size--;
4785 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004786 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004787#endif
4788 STORECHAR(ch);
4789 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004790
4791 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004792 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004793#undef STORECHAR
4794}
4795
Alexander Belopolsky40018472011-02-26 01:02:56 +00004796PyObject *
4797PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004798{
4799 if (!PyUnicode_Check(unicode)) {
4800 PyErr_BadArgument();
4801 return NULL;
4802 }
4803 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004804 PyUnicode_GET_SIZE(unicode),
4805 NULL,
4806 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004807}
4808
Guido van Rossumd57fd912000-03-10 22:53:23 +00004809/* --- UTF-16 Codec ------------------------------------------------------- */
4810
Tim Peters772747b2001-08-09 22:21:55 +00004811PyObject *
4812PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004813 Py_ssize_t size,
4814 const char *errors,
4815 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004816{
Walter Dörwald69652032004-09-07 20:24:22 +00004817 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
4818}
4819
Antoine Pitrouab868312009-01-10 15:40:25 +00004820/* Two masks for fast checking of whether a C 'long' may contain
4821 UTF16-encoded surrogate characters. This is an efficient heuristic,
4822 assuming that non-surrogate characters with a code point >= 0x8000 are
4823 rare in most input.
4824 FAST_CHAR_MASK is used when the input is in native byte ordering,
4825 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00004826*/
Antoine Pitrouab868312009-01-10 15:40:25 +00004827#if (SIZEOF_LONG == 8)
4828# define FAST_CHAR_MASK 0x8000800080008000L
4829# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
4830#elif (SIZEOF_LONG == 4)
4831# define FAST_CHAR_MASK 0x80008000L
4832# define SWAPPED_FAST_CHAR_MASK 0x00800080L
4833#else
4834# error C 'long' size should be either 4 or 8!
4835#endif
4836
Walter Dörwald69652032004-09-07 20:24:22 +00004837PyObject *
4838PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004839 Py_ssize_t size,
4840 const char *errors,
4841 int *byteorder,
4842 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004843{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004844 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004845 Py_ssize_t startinpos;
4846 Py_ssize_t endinpos;
4847 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004848 PyUnicodeObject *unicode;
4849 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004850 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00004851 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00004852 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004853 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00004854 /* Offsets from q for retrieving byte pairs in the right order. */
4855#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4856 int ihi = 1, ilo = 0;
4857#else
4858 int ihi = 0, ilo = 1;
4859#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004860 PyObject *errorHandler = NULL;
4861 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004862
4863 /* Note: size will always be longer than the resulting Unicode
4864 character count */
4865 unicode = _PyUnicode_New(size);
4866 if (!unicode)
4867 return NULL;
4868 if (size == 0)
4869 return (PyObject *)unicode;
4870
4871 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004872 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00004873 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00004874 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004875
4876 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00004877 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004878
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004879 /* Check for BOM marks (U+FEFF) in the input and adjust current
4880 byte order setting accordingly. In native mode, the leading BOM
4881 mark is skipped, in all other modes, it is copied to the output
4882 stream as-is (giving a ZWNBSP character). */
4883 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00004884 if (size >= 2) {
4885 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004886#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004887 if (bom == 0xFEFF) {
4888 q += 2;
4889 bo = -1;
4890 }
4891 else if (bom == 0xFFFE) {
4892 q += 2;
4893 bo = 1;
4894 }
Tim Petersced69f82003-09-16 20:30:58 +00004895#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004896 if (bom == 0xFEFF) {
4897 q += 2;
4898 bo = 1;
4899 }
4900 else if (bom == 0xFFFE) {
4901 q += 2;
4902 bo = -1;
4903 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004904#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004905 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004906 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004907
Tim Peters772747b2001-08-09 22:21:55 +00004908 if (bo == -1) {
4909 /* force LE */
4910 ihi = 1;
4911 ilo = 0;
4912 }
4913 else if (bo == 1) {
4914 /* force BE */
4915 ihi = 0;
4916 ilo = 1;
4917 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004918#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4919 native_ordering = ilo < ihi;
4920#else
4921 native_ordering = ilo > ihi;
4922#endif
Tim Peters772747b2001-08-09 22:21:55 +00004923
Antoine Pitrouab868312009-01-10 15:40:25 +00004924 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00004925 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004926 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00004927 /* First check for possible aligned read of a C 'long'. Unaligned
4928 reads are more expensive, better to defer to another iteration. */
4929 if (!((size_t) q & LONG_PTR_MASK)) {
4930 /* Fast path for runs of non-surrogate chars. */
4931 register const unsigned char *_q = q;
4932 Py_UNICODE *_p = p;
4933 if (native_ordering) {
4934 /* Native ordering is simple: as long as the input cannot
4935 possibly contain a surrogate char, do an unrolled copy
4936 of several 16-bit code points to the target object.
4937 The non-surrogate check is done on several input bytes
4938 at a time (as many as a C 'long' can contain). */
4939 while (_q < aligned_end) {
4940 unsigned long data = * (unsigned long *) _q;
4941 if (data & FAST_CHAR_MASK)
4942 break;
4943 _p[0] = ((unsigned short *) _q)[0];
4944 _p[1] = ((unsigned short *) _q)[1];
4945#if (SIZEOF_LONG == 8)
4946 _p[2] = ((unsigned short *) _q)[2];
4947 _p[3] = ((unsigned short *) _q)[3];
4948#endif
4949 _q += SIZEOF_LONG;
4950 _p += SIZEOF_LONG / 2;
4951 }
4952 }
4953 else {
4954 /* Byteswapped ordering is similar, but we must decompose
4955 the copy bytewise, and take care of zero'ing out the
4956 upper bytes if the target object is in 32-bit units
4957 (that is, in UCS-4 builds). */
4958 while (_q < aligned_end) {
4959 unsigned long data = * (unsigned long *) _q;
4960 if (data & SWAPPED_FAST_CHAR_MASK)
4961 break;
4962 /* Zero upper bytes in UCS-4 builds */
4963#if (Py_UNICODE_SIZE > 2)
4964 _p[0] = 0;
4965 _p[1] = 0;
4966#if (SIZEOF_LONG == 8)
4967 _p[2] = 0;
4968 _p[3] = 0;
4969#endif
4970#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004971 /* Issue #4916; UCS-4 builds on big endian machines must
4972 fill the two last bytes of each 4-byte unit. */
4973#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
4974# define OFF 2
4975#else
4976# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00004977#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004978 ((unsigned char *) _p)[OFF + 1] = _q[0];
4979 ((unsigned char *) _p)[OFF + 0] = _q[1];
4980 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
4981 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
4982#if (SIZEOF_LONG == 8)
4983 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
4984 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
4985 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
4986 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
4987#endif
4988#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00004989 _q += SIZEOF_LONG;
4990 _p += SIZEOF_LONG / 2;
4991 }
4992 }
4993 p = _p;
4994 q = _q;
4995 if (q >= e)
4996 break;
4997 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004998 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004999
Benjamin Peterson14339b62009-01-31 16:36:08 +00005000 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005001
5002 if (ch < 0xD800 || ch > 0xDFFF) {
5003 *p++ = ch;
5004 continue;
5005 }
5006
5007 /* UTF-16 code pair: */
5008 if (q > e) {
5009 errmsg = "unexpected end of data";
5010 startinpos = (((const char *)q) - 2) - starts;
5011 endinpos = ((const char *)e) + 1 - starts;
5012 goto utf16Error;
5013 }
5014 if (0xD800 <= ch && ch <= 0xDBFF) {
5015 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
5016 q += 2;
5017 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00005018#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005019 *p++ = ch;
5020 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005021#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005022 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005023#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005024 continue;
5025 }
5026 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005027 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005028 startinpos = (((const char *)q)-4)-starts;
5029 endinpos = startinpos+2;
5030 goto utf16Error;
5031 }
5032
Benjamin Peterson14339b62009-01-31 16:36:08 +00005033 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005034 errmsg = "illegal encoding";
5035 startinpos = (((const char *)q)-2)-starts;
5036 endinpos = startinpos+2;
5037 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005038
Benjamin Peterson29060642009-01-31 22:14:21 +00005039 utf16Error:
5040 outpos = p - PyUnicode_AS_UNICODE(unicode);
5041 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005042 errors,
5043 &errorHandler,
5044 "utf16", errmsg,
5045 &starts,
5046 (const char **)&e,
5047 &startinpos,
5048 &endinpos,
5049 &exc,
5050 (const char **)&q,
5051 &unicode,
5052 &outpos,
5053 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00005054 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005055 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005056 /* remaining byte at the end? (size should be even) */
5057 if (e == q) {
5058 if (!consumed) {
5059 errmsg = "truncated data";
5060 startinpos = ((const char *)q) - starts;
5061 endinpos = ((const char *)e) + 1 - starts;
5062 outpos = p - PyUnicode_AS_UNICODE(unicode);
5063 if (unicode_decode_call_errorhandler(
5064 errors,
5065 &errorHandler,
5066 "utf16", errmsg,
5067 &starts,
5068 (const char **)&e,
5069 &startinpos,
5070 &endinpos,
5071 &exc,
5072 (const char **)&q,
5073 &unicode,
5074 &outpos,
5075 &p))
5076 goto onError;
5077 /* The remaining input chars are ignored if the callback
5078 chooses to skip the input */
5079 }
5080 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005081
5082 if (byteorder)
5083 *byteorder = bo;
5084
Walter Dörwald69652032004-09-07 20:24:22 +00005085 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005086 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005087
Guido van Rossumd57fd912000-03-10 22:53:23 +00005088 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02005089 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005090 goto onError;
5091
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005092 Py_XDECREF(errorHandler);
5093 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005094 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005095 Py_DECREF(unicode);
5096 return NULL;
5097 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005098 return (PyObject *)unicode;
5099
Benjamin Peterson29060642009-01-31 22:14:21 +00005100 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005101 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005102 Py_XDECREF(errorHandler);
5103 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005104 return NULL;
5105}
5106
Antoine Pitrouab868312009-01-10 15:40:25 +00005107#undef FAST_CHAR_MASK
5108#undef SWAPPED_FAST_CHAR_MASK
5109
Tim Peters772747b2001-08-09 22:21:55 +00005110PyObject *
5111PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005112 Py_ssize_t size,
5113 const char *errors,
5114 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005115{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005116 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005117 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005118 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005119#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005120 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005121#else
5122 const int pairs = 0;
5123#endif
Tim Peters772747b2001-08-09 22:21:55 +00005124 /* Offsets from p for storing byte pairs in the right order. */
5125#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5126 int ihi = 1, ilo = 0;
5127#else
5128 int ihi = 0, ilo = 1;
5129#endif
5130
Benjamin Peterson29060642009-01-31 22:14:21 +00005131#define STORECHAR(CH) \
5132 do { \
5133 p[ihi] = ((CH) >> 8) & 0xff; \
5134 p[ilo] = (CH) & 0xff; \
5135 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005136 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005137
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005138#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005139 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00005140 if (s[i] >= 0x10000)
5141 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005142#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005143 /* 2 * (size + pairs + (byteorder == 0)) */
5144 if (size > PY_SSIZE_T_MAX ||
5145 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005146 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005147 nsize = size + pairs + (byteorder == 0);
5148 bytesize = nsize * 2;
5149 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005150 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005151 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005152 if (v == NULL)
5153 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005154
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005155 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005156 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005157 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005158 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005159 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005160
5161 if (byteorder == -1) {
5162 /* force LE */
5163 ihi = 1;
5164 ilo = 0;
5165 }
5166 else if (byteorder == 1) {
5167 /* force BE */
5168 ihi = 0;
5169 ilo = 1;
5170 }
5171
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005172 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005173 Py_UNICODE ch = *s++;
5174 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005175#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005176 if (ch >= 0x10000) {
5177 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5178 ch = 0xD800 | ((ch-0x10000) >> 10);
5179 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005180#endif
Tim Peters772747b2001-08-09 22:21:55 +00005181 STORECHAR(ch);
5182 if (ch2)
5183 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005184 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005185
5186 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005187 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005188#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005189}
5190
Alexander Belopolsky40018472011-02-26 01:02:56 +00005191PyObject *
5192PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005193{
5194 if (!PyUnicode_Check(unicode)) {
5195 PyErr_BadArgument();
5196 return NULL;
5197 }
5198 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005199 PyUnicode_GET_SIZE(unicode),
5200 NULL,
5201 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005202}
5203
5204/* --- Unicode Escape Codec ----------------------------------------------- */
5205
/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
   if all the escapes in the string make it still a valid ASCII string.

   s    - the escaped input, `size` bytes long (need not be
          NUL-terminated)

   Returns the length of the buffer required to hold the unescaped
   string, or -1 if
     - size is negative,
     - the input contains a byte > 127,
     - a backslash is the last byte of the input, or
     - an escape is found whose result is not guaranteed to be ASCII
       (octal, \x, \u, \U, \N).
 */
Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p = (const unsigned char *)s;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Reject a negative size before it is used in pointer arithmetic:
       forming p + size for size < 0 is already out of range. */
    if (size < 0)
        return -1;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* count the backslash + the other character */
                length += 2;
                break;
            }
        }
    }
    return length;
}
5260
/* Similar to PyUnicode_WRITE, but the destination is chosen by KIND:
   for PyUnicode_WCHAR_KIND, BUF is indexed as a Py_UNICODE array (the
   wstr field); for any other kind BUF is indexed as an array of bytes,
   i.e. the string is treated as ASCII.  Exactly one of the two stores is
   executed, so INDEX and VALUE are evaluated once. */
#define WRITE_ASCII_OR_WSTR(kind, buf, index, value)                      \
    do {                                                                  \
        if ((kind) == PyUnicode_WCHAR_KIND)                               \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);         \
        else                                                              \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value);   \
    } while (0)

/* Store VALUE at BUF[INDEX], with BUF indexed as a Py_UNICODE array.
   Expects a variable named `kind` to be in scope where the macro is
   expanded and asserts that it equals PyUnicode_WCHAR_KIND. */
#define WRITE_WSTR(buf, index, value)                                     \
    (assert(kind == PyUnicode_WCHAR_KIND),                                \
     ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value))

5274
5275
Fredrik Lundh06d12682001-01-24 07:59:11 +00005276static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005277
Alexander Belopolsky40018472011-02-26 01:02:56 +00005278PyObject *
5279PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005280 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005281 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005282{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005283 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005284 Py_ssize_t startinpos;
5285 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005286 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005287 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005288 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005289 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005290 char* message;
5291 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005292 PyObject *errorHandler = NULL;
5293 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005294 Py_ssize_t ascii_length;
5295 Py_ssize_t i;
5296 int kind;
5297 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005298
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005299 ascii_length = length_of_escaped_ascii_string(s, size);
5300
5301 /* After length_of_escaped_ascii_string() there are two alternatives,
5302 either the string is pure ASCII with named escapes like \n, etc.
5303 and we determined it's exact size (common case)
5304 or it contains \x, \u, ... escape sequences. then we create a
5305 legacy wchar string and resize it at the end of this function. */
5306 if (ascii_length >= 0) {
5307 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
5308 if (!v)
5309 goto onError;
5310 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5311 kind = PyUnicode_1BYTE_KIND;
5312 data = PyUnicode_DATA(v);
5313 }
5314 else {
5315 /* Escaped strings will always be longer than the resulting
5316 Unicode string, so we start with size here and then reduce the
5317 length after conversion to the true value.
5318 (but if the error callback returns a long replacement string
5319 we'll have to allocate more space) */
5320 v = _PyUnicode_New(size);
5321 if (!v)
5322 goto onError;
5323 kind = PyUnicode_WCHAR_KIND;
5324 data = PyUnicode_AS_UNICODE(v);
5325 }
5326
Guido van Rossumd57fd912000-03-10 22:53:23 +00005327 if (size == 0)
5328 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005329 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005330 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005331
Guido van Rossumd57fd912000-03-10 22:53:23 +00005332 while (s < end) {
5333 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005334 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005335 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005336
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005337 if (kind == PyUnicode_WCHAR_KIND) {
5338 assert(i < _PyUnicode_WSTR_LENGTH(v));
5339 }
5340 else {
5341 /* The only case in which i == ascii_length is a backslash
5342 followed by a newline. */
5343 assert(i <= ascii_length);
5344 }
5345
Guido van Rossumd57fd912000-03-10 22:53:23 +00005346 /* Non-escape characters are interpreted as Unicode ordinals */
5347 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005348 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005349 continue;
5350 }
5351
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005352 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005353 /* \ - Escapes */
5354 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005355 c = *s++;
5356 if (s > end)
5357 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005358
5359 if (kind == PyUnicode_WCHAR_KIND) {
5360 assert(i < _PyUnicode_WSTR_LENGTH(v));
5361 }
5362 else {
5363 /* The only case in which i == ascii_length is a backslash
5364 followed by a newline. */
5365 assert(i < ascii_length || (i == ascii_length && c == '\n'));
5366 }
5367
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005368 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005369
Benjamin Peterson29060642009-01-31 22:14:21 +00005370 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005371 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005372 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5373 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5374 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5375 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5376 /* FF */
5377 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5378 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5379 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5380 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5381 /* VT */
5382 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5383 /* BEL, not classic C */
5384 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005385
Benjamin Peterson29060642009-01-31 22:14:21 +00005386 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005387 case '0': case '1': case '2': case '3':
5388 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005389 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005390 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005391 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005392 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005393 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005394 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005395 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005396 break;
5397
Benjamin Peterson29060642009-01-31 22:14:21 +00005398 /* hex escapes */
5399 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005400 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005401 digits = 2;
5402 message = "truncated \\xXX escape";
5403 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005404
Benjamin Peterson29060642009-01-31 22:14:21 +00005405 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005406 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005407 digits = 4;
5408 message = "truncated \\uXXXX escape";
5409 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005410
Benjamin Peterson29060642009-01-31 22:14:21 +00005411 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005412 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005413 digits = 8;
5414 message = "truncated \\UXXXXXXXX escape";
5415 hexescape:
5416 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005417 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005418 if (s+digits>end) {
5419 endinpos = size;
5420 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005421 errors, &errorHandler,
5422 "unicodeescape", "end of string in escape sequence",
5423 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005424 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005425 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005426 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005427 goto nextByte;
5428 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005429 for (j = 0; j < digits; ++j) {
5430 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005431 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005432 endinpos = (s+j+1)-starts;
5433 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005434 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005435 errors, &errorHandler,
5436 "unicodeescape", message,
5437 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005438 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005439 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005440 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005441 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005442 }
5443 chr = (chr<<4) & ~0xF;
5444 if (c >= '0' && c <= '9')
5445 chr += c - '0';
5446 else if (c >= 'a' && c <= 'f')
5447 chr += 10 + c - 'a';
5448 else
5449 chr += 10 + c - 'A';
5450 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005451 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005452 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005453 /* _decoding_error will have already written into the
5454 target buffer. */
5455 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005456 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005457 /* when we get here, chr is a 32-bit unicode character */
5458 if (chr <= 0xffff)
5459 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005460 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005461 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005462 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005463 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005464#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005465 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005466#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005467 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005468 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5469 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005470#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005471 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005472 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005473 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005474 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005475 errors, &errorHandler,
5476 "unicodeescape", "illegal Unicode character",
5477 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005478 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005479 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005480 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005481 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005482 break;
5483
Benjamin Peterson29060642009-01-31 22:14:21 +00005484 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005485 case 'N':
5486 message = "malformed \\N character escape";
5487 if (ucnhash_CAPI == NULL) {
5488 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005489 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5490 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005491 if (ucnhash_CAPI == NULL)
5492 goto ucnhashError;
5493 }
5494 if (*s == '{') {
5495 const char *start = s+1;
5496 /* look for the closing brace */
5497 while (*s != '}' && s < end)
5498 s++;
5499 if (s > start && s < end && *s == '}') {
5500 /* found a name. look it up in the unicode database */
5501 message = "unknown Unicode character name";
5502 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005503 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5504 &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005505 goto store;
5506 }
5507 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005508 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005509 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005510 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005511 errors, &errorHandler,
5512 "unicodeescape", message,
5513 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005514 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005515 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005516 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005517 break;
5518
5519 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005520 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005521 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005522 message = "\\ at end of string";
5523 s--;
5524 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005525 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005526 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005527 errors, &errorHandler,
5528 "unicodeescape", message,
5529 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005530 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005531 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005532 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005533 }
5534 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005535 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5536 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005537 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005538 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005539 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005540 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005541 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005542 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005543 /* Ensure the length prediction worked in case of ASCII strings */
5544 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5545
Victor Stinnerfe226c02011-10-03 03:52:20 +02005546 if (kind == PyUnicode_WCHAR_KIND)
5547 {
5548 if (PyUnicode_Resize((PyObject**)&v, i) < 0)
5549 goto onError;
Victor Stinnerfe226c02011-10-03 03:52:20 +02005550 }
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005551 Py_XDECREF(errorHandler);
5552 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005553 if (_PyUnicode_READY_REPLACE(&v)) {
5554 Py_DECREF(v);
5555 return NULL;
5556 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005557 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005558
Benjamin Peterson29060642009-01-31 22:14:21 +00005559 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005560 PyErr_SetString(
5561 PyExc_UnicodeError,
5562 "\\N escapes not supported (can't load unicodedata module)"
5563 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005564 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005565 Py_XDECREF(errorHandler);
5566 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005567 return NULL;
5568
Benjamin Peterson29060642009-01-31 22:14:21 +00005569 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005570 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005571 Py_XDECREF(errorHandler);
5572 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005573 return NULL;
5574}
5575
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005576#undef WRITE_ASCII_OR_WSTR
5577#undef WRITE_WSTR
5578
Guido van Rossumd57fd912000-03-10 22:53:23 +00005579/* Return a Unicode-Escape string version of the Unicode object.
5580
5581 If quotes is true, the string is enclosed in u"" or u'' quotes as
5582 appropriate.
5583
5584*/
5585
Walter Dörwald79e913e2007-05-12 11:08:06 +00005586static const char *hexdigits = "0123456789abcdef";
5587
Alexander Belopolsky40018472011-02-26 01:02:56 +00005588PyObject *
5589PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005590 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005591{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005592 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005593 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005594
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005595#ifdef Py_UNICODE_WIDE
5596 const Py_ssize_t expandsize = 10;
5597#else
5598 const Py_ssize_t expandsize = 6;
5599#endif
5600
Thomas Wouters89f507f2006-12-13 04:49:30 +00005601 /* XXX(nnorwitz): rather than over-allocating, it would be
5602 better to choose a different scheme. Perhaps scan the
5603 first N-chars of the string and allocate based on that size.
5604 */
5605 /* Initial allocation is based on the longest-possible unichr
5606 escape.
5607
5608 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5609 unichr, so in this case it's the longest unichr escape. In
5610 narrow (UTF-16) builds this is five chars per source unichr
5611 since there are two unichrs in the surrogate pair, so in narrow
5612 (UTF-16) builds it's not the longest unichr escape.
5613
5614 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5615 so in the narrow (UTF-16) build case it's the longest unichr
5616 escape.
5617 */
5618
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005619 if (size == 0)
5620 return PyBytes_FromStringAndSize(NULL, 0);
5621
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005622 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005623 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005624
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005625 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005626 2
5627 + expandsize*size
5628 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005629 if (repr == NULL)
5630 return NULL;
5631
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005632 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005633
Guido van Rossumd57fd912000-03-10 22:53:23 +00005634 while (size-- > 0) {
5635 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005636
Walter Dörwald79e913e2007-05-12 11:08:06 +00005637 /* Escape backslashes */
5638 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005639 *p++ = '\\';
5640 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005641 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005642 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005643
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00005644#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005645 /* Map 21-bit characters to '\U00xxxxxx' */
5646 else if (ch >= 0x10000) {
5647 *p++ = '\\';
5648 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005649 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5650 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5651 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5652 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5653 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5654 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5655 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5656 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005657 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005658 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005659#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005660 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5661 else if (ch >= 0xD800 && ch < 0xDC00) {
5662 Py_UNICODE ch2;
5663 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00005664
Benjamin Peterson29060642009-01-31 22:14:21 +00005665 ch2 = *s++;
5666 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005667 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005668 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5669 *p++ = '\\';
5670 *p++ = 'U';
5671 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
5672 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
5673 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
5674 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
5675 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
5676 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
5677 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
5678 *p++ = hexdigits[ucs & 0x0000000F];
5679 continue;
5680 }
5681 /* Fall through: isolated surrogates are copied as-is */
5682 s--;
5683 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005684 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005685#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005686
Guido van Rossumd57fd912000-03-10 22:53:23 +00005687 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005688 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005689 *p++ = '\\';
5690 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005691 *p++ = hexdigits[(ch >> 12) & 0x000F];
5692 *p++ = hexdigits[(ch >> 8) & 0x000F];
5693 *p++ = hexdigits[(ch >> 4) & 0x000F];
5694 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005695 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005696
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005697 /* Map special whitespace to '\t', \n', '\r' */
5698 else if (ch == '\t') {
5699 *p++ = '\\';
5700 *p++ = 't';
5701 }
5702 else if (ch == '\n') {
5703 *p++ = '\\';
5704 *p++ = 'n';
5705 }
5706 else if (ch == '\r') {
5707 *p++ = '\\';
5708 *p++ = 'r';
5709 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005710
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005711 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005712 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005713 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005714 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005715 *p++ = hexdigits[(ch >> 4) & 0x000F];
5716 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005717 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005718
Guido van Rossumd57fd912000-03-10 22:53:23 +00005719 /* Copy everything else as-is */
5720 else
5721 *p++ = (char) ch;
5722 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005723
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005724 assert(p - PyBytes_AS_STRING(repr) > 0);
5725 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5726 return NULL;
5727 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005728}
5729
Alexander Belopolsky40018472011-02-26 01:02:56 +00005730PyObject *
5731PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005732{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005733 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005734 if (!PyUnicode_Check(unicode)) {
5735 PyErr_BadArgument();
5736 return NULL;
5737 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00005738 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5739 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005740 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005741}
5742
5743/* --- Raw Unicode Escape Codec ------------------------------------------- */
5744
Alexander Belopolsky40018472011-02-26 01:02:56 +00005745PyObject *
5746PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005747 Py_ssize_t size,
5748 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005749{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005750 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005751 Py_ssize_t startinpos;
5752 Py_ssize_t endinpos;
5753 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005754 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005755 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005756 const char *end;
5757 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005758 PyObject *errorHandler = NULL;
5759 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005760
Guido van Rossumd57fd912000-03-10 22:53:23 +00005761 /* Escaped strings will always be longer than the resulting
5762 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005763 length after conversion to the true value. (But decoding error
5764 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005765 v = _PyUnicode_New(size);
5766 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005767 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005768 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005769 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005770 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005771 end = s + size;
5772 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005773 unsigned char c;
5774 Py_UCS4 x;
5775 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005776 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005777
Benjamin Peterson29060642009-01-31 22:14:21 +00005778 /* Non-escape characters are interpreted as Unicode ordinals */
5779 if (*s != '\\') {
5780 *p++ = (unsigned char)*s++;
5781 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005782 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005783 startinpos = s-starts;
5784
5785 /* \u-escapes are only interpreted iff the number of leading
5786 backslashes if odd */
5787 bs = s;
5788 for (;s < end;) {
5789 if (*s != '\\')
5790 break;
5791 *p++ = (unsigned char)*s++;
5792 }
5793 if (((s - bs) & 1) == 0 ||
5794 s >= end ||
5795 (*s != 'u' && *s != 'U')) {
5796 continue;
5797 }
5798 p--;
5799 count = *s=='u' ? 4 : 8;
5800 s++;
5801
5802 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
5803 outpos = p-PyUnicode_AS_UNICODE(v);
5804 for (x = 0, i = 0; i < count; ++i, ++s) {
5805 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005806 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005807 endinpos = s-starts;
5808 if (unicode_decode_call_errorhandler(
5809 errors, &errorHandler,
5810 "rawunicodeescape", "truncated \\uXXXX",
5811 &starts, &end, &startinpos, &endinpos, &exc, &s,
5812 &v, &outpos, &p))
5813 goto onError;
5814 goto nextByte;
5815 }
5816 x = (x<<4) & ~0xF;
5817 if (c >= '0' && c <= '9')
5818 x += c - '0';
5819 else if (c >= 'a' && c <= 'f')
5820 x += 10 + c - 'a';
5821 else
5822 x += 10 + c - 'A';
5823 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00005824 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00005825 /* UCS-2 character */
5826 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005827 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005828 /* UCS-4 character. Either store directly, or as
5829 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00005830#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005831 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005832#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005833 x -= 0x10000L;
5834 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
5835 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00005836#endif
5837 } else {
5838 endinpos = s-starts;
5839 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005840 if (unicode_decode_call_errorhandler(
5841 errors, &errorHandler,
5842 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005843 &starts, &end, &startinpos, &endinpos, &exc, &s,
5844 &v, &outpos, &p))
5845 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005846 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005847 nextByte:
5848 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005849 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02005850 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005851 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005852 Py_XDECREF(errorHandler);
5853 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005854 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005855 Py_DECREF(v);
5856 return NULL;
5857 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005858 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005859
Benjamin Peterson29060642009-01-31 22:14:21 +00005860 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005861 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005862 Py_XDECREF(errorHandler);
5863 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005864 return NULL;
5865}
5866
Alexander Belopolsky40018472011-02-26 01:02:56 +00005867PyObject *
5868PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005869 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005870{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005871 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005872 char *p;
5873 char *q;
5874
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005875#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005876 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005877#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005878 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005879#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00005880
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005881 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005882 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005883
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005884 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005885 if (repr == NULL)
5886 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005887 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005888 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005889
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005890 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005891 while (size-- > 0) {
5892 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005893#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005894 /* Map 32-bit characters to '\Uxxxxxxxx' */
5895 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005896 *p++ = '\\';
5897 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005898 *p++ = hexdigits[(ch >> 28) & 0xf];
5899 *p++ = hexdigits[(ch >> 24) & 0xf];
5900 *p++ = hexdigits[(ch >> 20) & 0xf];
5901 *p++ = hexdigits[(ch >> 16) & 0xf];
5902 *p++ = hexdigits[(ch >> 12) & 0xf];
5903 *p++ = hexdigits[(ch >> 8) & 0xf];
5904 *p++ = hexdigits[(ch >> 4) & 0xf];
5905 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00005906 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005907 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00005908#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005909 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5910 if (ch >= 0xD800 && ch < 0xDC00) {
5911 Py_UNICODE ch2;
5912 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005913
Benjamin Peterson29060642009-01-31 22:14:21 +00005914 ch2 = *s++;
5915 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005916 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005917 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5918 *p++ = '\\';
5919 *p++ = 'U';
5920 *p++ = hexdigits[(ucs >> 28) & 0xf];
5921 *p++ = hexdigits[(ucs >> 24) & 0xf];
5922 *p++ = hexdigits[(ucs >> 20) & 0xf];
5923 *p++ = hexdigits[(ucs >> 16) & 0xf];
5924 *p++ = hexdigits[(ucs >> 12) & 0xf];
5925 *p++ = hexdigits[(ucs >> 8) & 0xf];
5926 *p++ = hexdigits[(ucs >> 4) & 0xf];
5927 *p++ = hexdigits[ucs & 0xf];
5928 continue;
5929 }
5930 /* Fall through: isolated surrogates are copied as-is */
5931 s--;
5932 size++;
5933 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005934#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005935 /* Map 16-bit characters to '\uxxxx' */
5936 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005937 *p++ = '\\';
5938 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005939 *p++ = hexdigits[(ch >> 12) & 0xf];
5940 *p++ = hexdigits[(ch >> 8) & 0xf];
5941 *p++ = hexdigits[(ch >> 4) & 0xf];
5942 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005943 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005944 /* Copy everything else as-is */
5945 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00005946 *p++ = (char) ch;
5947 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005948 size = p - q;
5949
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005950 assert(size > 0);
5951 if (_PyBytes_Resize(&repr, size) < 0)
5952 return NULL;
5953 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005954}
5955
Alexander Belopolsky40018472011-02-26 01:02:56 +00005956PyObject *
5957PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005958{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005959 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005960 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00005961 PyErr_BadArgument();
5962 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005963 }
Walter Dörwald711005d2007-05-12 12:03:26 +00005964 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5965 PyUnicode_GET_SIZE(unicode));
5966
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005967 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005968}
5969
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005970/* --- Unicode Internal Codec ------------------------------------------- */
5971
Alexander Belopolsky40018472011-02-26 01:02:56 +00005972PyObject *
5973_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005974 Py_ssize_t size,
5975 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005976{
5977 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005978 Py_ssize_t startinpos;
5979 Py_ssize_t endinpos;
5980 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005981 PyUnicodeObject *v;
5982 Py_UNICODE *p;
5983 const char *end;
5984 const char *reason;
5985 PyObject *errorHandler = NULL;
5986 PyObject *exc = NULL;
5987
Neal Norwitzd43069c2006-01-08 01:12:10 +00005988#ifdef Py_UNICODE_WIDE
5989 Py_UNICODE unimax = PyUnicode_GetMax();
5990#endif
5991
Thomas Wouters89f507f2006-12-13 04:49:30 +00005992 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005993 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
5994 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005995 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005996 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
5997 as string was created with the old API. */
5998 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005999 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006000 p = PyUnicode_AS_UNICODE(v);
6001 end = s + size;
6002
6003 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006004 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006005 /* We have to sanity check the raw data, otherwise doom looms for
6006 some malformed UCS-4 data. */
6007 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006008#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006009 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006010#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006011 end-s < Py_UNICODE_SIZE
6012 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006013 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006014 startinpos = s - starts;
6015 if (end-s < Py_UNICODE_SIZE) {
6016 endinpos = end-starts;
6017 reason = "truncated input";
6018 }
6019 else {
6020 endinpos = s - starts + Py_UNICODE_SIZE;
6021 reason = "illegal code point (> 0x10FFFF)";
6022 }
6023 outpos = p - PyUnicode_AS_UNICODE(v);
6024 if (unicode_decode_call_errorhandler(
6025 errors, &errorHandler,
6026 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006027 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00006028 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006029 goto onError;
6030 }
6031 }
6032 else {
6033 p++;
6034 s += Py_UNICODE_SIZE;
6035 }
6036 }
6037
Victor Stinnerfe226c02011-10-03 03:52:20 +02006038 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006039 goto onError;
6040 Py_XDECREF(errorHandler);
6041 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006042 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006043 Py_DECREF(v);
6044 return NULL;
6045 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006046 return (PyObject *)v;
6047
Benjamin Peterson29060642009-01-31 22:14:21 +00006048 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006049 Py_XDECREF(v);
6050 Py_XDECREF(errorHandler);
6051 Py_XDECREF(exc);
6052 return NULL;
6053}
6054
Guido van Rossumd57fd912000-03-10 22:53:23 +00006055/* --- Latin-1 Codec ------------------------------------------------------ */
6056
Alexander Belopolsky40018472011-02-26 01:02:56 +00006057PyObject *
6058PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006059 Py_ssize_t size,
6060 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006061{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006062 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006063 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006064}
6065
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006066/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006067static void
6068make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006069 const char *encoding,
6070 const Py_UNICODE *unicode, Py_ssize_t size,
6071 Py_ssize_t startpos, Py_ssize_t endpos,
6072 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006073{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006074 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006075 *exceptionObject = PyUnicodeEncodeError_Create(
6076 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006077 }
6078 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006079 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6080 goto onError;
6081 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6082 goto onError;
6083 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6084 goto onError;
6085 return;
6086 onError:
6087 Py_DECREF(*exceptionObject);
6088 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006089 }
6090}
6091
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006092/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006093static void
6094raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006095 const char *encoding,
6096 const Py_UNICODE *unicode, Py_ssize_t size,
6097 Py_ssize_t startpos, Py_ssize_t endpos,
6098 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006099{
6100 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006101 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006102 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006103 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006104}
6105
6106/* error handling callback helper:
6107 build arguments, call the callback and check the arguments,
6108 put the result into newpos and return the replacement string, which
6109 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006110static PyObject *
6111unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006112 PyObject **errorHandler,
6113 const char *encoding, const char *reason,
6114 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
6115 Py_ssize_t startpos, Py_ssize_t endpos,
6116 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006117{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006118 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006119
6120 PyObject *restuple;
6121 PyObject *resunicode;
6122
6123 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006124 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006125 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006126 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006127 }
6128
6129 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006130 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006131 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006132 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006133
6134 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006135 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006136 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006137 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006138 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006139 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006140 Py_DECREF(restuple);
6141 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006142 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006143 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006144 &resunicode, newpos)) {
6145 Py_DECREF(restuple);
6146 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006147 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006148 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6149 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6150 Py_DECREF(restuple);
6151 return NULL;
6152 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006153 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006154 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006155 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006156 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6157 Py_DECREF(restuple);
6158 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006159 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006160 Py_INCREF(resunicode);
6161 Py_DECREF(restuple);
6162 return resunicode;
6163}
6164
Alexander Belopolsky40018472011-02-26 01:02:56 +00006165static PyObject *
6166unicode_encode_ucs1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006167 Py_ssize_t size,
6168 const char *errors,
6169 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006170{
6171 /* output object */
6172 PyObject *res;
6173 /* pointers to the beginning and end+1 of input */
6174 const Py_UNICODE *startp = p;
6175 const Py_UNICODE *endp = p + size;
6176 /* pointer to the beginning of the unencodable characters */
6177 /* const Py_UNICODE *badp = NULL; */
6178 /* pointer into the output */
6179 char *str;
6180 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006181 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006182 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6183 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006184 PyObject *errorHandler = NULL;
6185 PyObject *exc = NULL;
6186 /* the following variable is used for caching string comparisons
6187 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6188 int known_errorHandler = -1;
6189
6190 /* allocate enough for a simple encoding without
6191 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006192 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006193 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006194 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006195 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006196 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006197 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006198 ressize = size;
6199
6200 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006201 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006202
Benjamin Peterson29060642009-01-31 22:14:21 +00006203 /* can we encode this? */
6204 if (c<limit) {
6205 /* no overflow check, because we know that the space is enough */
6206 *str++ = (char)c;
6207 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006208 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006209 else {
6210 Py_ssize_t unicodepos = p-startp;
6211 Py_ssize_t requiredsize;
6212 PyObject *repunicode;
6213 Py_ssize_t repsize;
6214 Py_ssize_t newpos;
6215 Py_ssize_t respos;
6216 Py_UNICODE *uni2;
6217 /* startpos for collecting unencodable chars */
6218 const Py_UNICODE *collstart = p;
6219 const Py_UNICODE *collend = p;
6220 /* find all unecodable characters */
6221 while ((collend < endp) && ((*collend)>=limit))
6222 ++collend;
6223 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6224 if (known_errorHandler==-1) {
6225 if ((errors==NULL) || (!strcmp(errors, "strict")))
6226 known_errorHandler = 1;
6227 else if (!strcmp(errors, "replace"))
6228 known_errorHandler = 2;
6229 else if (!strcmp(errors, "ignore"))
6230 known_errorHandler = 3;
6231 else if (!strcmp(errors, "xmlcharrefreplace"))
6232 known_errorHandler = 4;
6233 else
6234 known_errorHandler = 0;
6235 }
6236 switch (known_errorHandler) {
6237 case 1: /* strict */
6238 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
6239 goto onError;
6240 case 2: /* replace */
6241 while (collstart++<collend)
6242 *str++ = '?'; /* fall through */
6243 case 3: /* ignore */
6244 p = collend;
6245 break;
6246 case 4: /* xmlcharrefreplace */
6247 respos = str - PyBytes_AS_STRING(res);
6248 /* determine replacement size (temporarily (mis)uses p) */
6249 for (p = collstart, repsize = 0; p < collend; ++p) {
6250 if (*p<10)
6251 repsize += 2+1+1;
6252 else if (*p<100)
6253 repsize += 2+2+1;
6254 else if (*p<1000)
6255 repsize += 2+3+1;
6256 else if (*p<10000)
6257 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006258#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006259 else
6260 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006261#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006262 else if (*p<100000)
6263 repsize += 2+5+1;
6264 else if (*p<1000000)
6265 repsize += 2+6+1;
6266 else
6267 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00006268#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006269 }
6270 requiredsize = respos+repsize+(endp-collend);
6271 if (requiredsize > ressize) {
6272 if (requiredsize<2*ressize)
6273 requiredsize = 2*ressize;
6274 if (_PyBytes_Resize(&res, requiredsize))
6275 goto onError;
6276 str = PyBytes_AS_STRING(res) + respos;
6277 ressize = requiredsize;
6278 }
6279 /* generate replacement (temporarily (mis)uses p) */
6280 for (p = collstart; p < collend; ++p) {
6281 str += sprintf(str, "&#%d;", (int)*p);
6282 }
6283 p = collend;
6284 break;
6285 default:
6286 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6287 encoding, reason, startp, size, &exc,
6288 collstart-startp, collend-startp, &newpos);
6289 if (repunicode == NULL)
6290 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006291 if (PyBytes_Check(repunicode)) {
6292 /* Directly copy bytes result to output. */
6293 repsize = PyBytes_Size(repunicode);
6294 if (repsize > 1) {
6295 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006296 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006297 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6298 Py_DECREF(repunicode);
6299 goto onError;
6300 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006301 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006302 ressize += repsize-1;
6303 }
6304 memcpy(str, PyBytes_AsString(repunicode), repsize);
6305 str += repsize;
6306 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006307 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006308 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006309 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006310 /* need more space? (at least enough for what we
6311 have+the replacement+the rest of the string, so
6312 we won't have to check space for encodable characters) */
6313 respos = str - PyBytes_AS_STRING(res);
6314 repsize = PyUnicode_GET_SIZE(repunicode);
6315 requiredsize = respos+repsize+(endp-collend);
6316 if (requiredsize > ressize) {
6317 if (requiredsize<2*ressize)
6318 requiredsize = 2*ressize;
6319 if (_PyBytes_Resize(&res, requiredsize)) {
6320 Py_DECREF(repunicode);
6321 goto onError;
6322 }
6323 str = PyBytes_AS_STRING(res) + respos;
6324 ressize = requiredsize;
6325 }
6326 /* check if there is anything unencodable in the replacement
6327 and copy it to the output */
6328 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
6329 c = *uni2;
6330 if (c >= limit) {
6331 raise_encode_exception(&exc, encoding, startp, size,
6332 unicodepos, unicodepos+1, reason);
6333 Py_DECREF(repunicode);
6334 goto onError;
6335 }
6336 *str = (char)c;
6337 }
6338 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006339 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006340 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006341 }
6342 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006343 /* Resize if we allocated to much */
6344 size = str - PyBytes_AS_STRING(res);
6345 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006346 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006347 if (_PyBytes_Resize(&res, size) < 0)
6348 goto onError;
6349 }
6350
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006351 Py_XDECREF(errorHandler);
6352 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006353 return res;
6354
6355 onError:
6356 Py_XDECREF(res);
6357 Py_XDECREF(errorHandler);
6358 Py_XDECREF(exc);
6359 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006360}
6361
Alexander Belopolsky40018472011-02-26 01:02:56 +00006362PyObject *
6363PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006364 Py_ssize_t size,
6365 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006366{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006367 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006368}
6369
Alexander Belopolsky40018472011-02-26 01:02:56 +00006370PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006371_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006372{
6373 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006374 PyErr_BadArgument();
6375 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006376 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006377 if (PyUnicode_READY(unicode) == -1)
6378 return NULL;
6379 /* Fast path: if it is a one-byte string, construct
6380 bytes object directly. */
6381 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6382 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6383 PyUnicode_GET_LENGTH(unicode));
6384 /* Non-Latin-1 characters present. Defer to above function to
6385 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006386 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006387 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006388 errors);
6389}
6390
6391PyObject*
6392PyUnicode_AsLatin1String(PyObject *unicode)
6393{
6394 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006395}
6396
6397/* --- 7-bit ASCII Codec -------------------------------------------------- */
6398
Alexander Belopolsky40018472011-02-26 01:02:56 +00006399PyObject *
6400PyUnicode_DecodeASCII(const char *s,
6401 Py_ssize_t size,
6402 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006403{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006404 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006405 PyUnicodeObject *v;
6406 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006407 Py_ssize_t startinpos;
6408 Py_ssize_t endinpos;
6409 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006410 const char *e;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006411 unsigned char* d;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006412 PyObject *errorHandler = NULL;
6413 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006414 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00006415
Guido van Rossumd57fd912000-03-10 22:53:23 +00006416 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006417 if (size == 1 && *(unsigned char*)s < 128)
6418 return PyUnicode_FromOrdinal(*(unsigned char*)s);
6419
6420 /* Fast path. Assume the input actually *is* ASCII, and allocate
6421 a single-block Unicode object with that assumption. If there is
6422 an error, drop the object and start over. */
6423 v = (PyUnicodeObject*)PyUnicode_New(size, 127);
6424 if (v == NULL)
6425 goto onError;
6426 d = PyUnicode_1BYTE_DATA(v);
6427 for (i = 0; i < size; i++) {
6428 unsigned char ch = ((unsigned char*)s)[i];
6429 if (ch < 128)
6430 d[i] = ch;
6431 else
6432 break;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006433 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006434 if (i == size)
6435 return (PyObject*)v;
6436 Py_DECREF(v); /* start over */
Tim Petersced69f82003-09-16 20:30:58 +00006437
Guido van Rossumd57fd912000-03-10 22:53:23 +00006438 v = _PyUnicode_New(size);
6439 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006440 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006441 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006442 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006443 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006444 e = s + size;
6445 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006446 register unsigned char c = (unsigned char)*s;
6447 if (c < 128) {
6448 *p++ = c;
6449 ++s;
6450 }
6451 else {
6452 startinpos = s-starts;
6453 endinpos = startinpos + 1;
6454 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
6455 if (unicode_decode_call_errorhandler(
6456 errors, &errorHandler,
6457 "ascii", "ordinal not in range(128)",
6458 &starts, &e, &startinpos, &endinpos, &exc, &s,
6459 &v, &outpos, &p))
6460 goto onError;
6461 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006462 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00006463 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinnerfe226c02011-10-03 03:52:20 +02006464 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006465 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006466 Py_XDECREF(errorHandler);
6467 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006468 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006469 Py_DECREF(v);
6470 return NULL;
6471 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006472 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006473
Benjamin Peterson29060642009-01-31 22:14:21 +00006474 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006475 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006476 Py_XDECREF(errorHandler);
6477 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006478 return NULL;
6479}
6480
Alexander Belopolsky40018472011-02-26 01:02:56 +00006481PyObject *
6482PyUnicode_EncodeASCII(const Py_UNICODE *p,
6483 Py_ssize_t size,
6484 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006485{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006486 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006487}
6488
Alexander Belopolsky40018472011-02-26 01:02:56 +00006489PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006490_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006491{
6492 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006493 PyErr_BadArgument();
6494 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006495 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006496 if (PyUnicode_READY(unicode) == -1)
6497 return NULL;
6498 /* Fast path: if it is an ASCII-only string, construct bytes object
6499 directly. Else defer to above function to raise the exception. */
6500 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6501 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6502 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006503 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006504 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006505 errors);
6506}
6507
6508PyObject *
6509PyUnicode_AsASCIIString(PyObject *unicode)
6510{
6511 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006512}
6513
Victor Stinner99b95382011-07-04 14:23:54 +02006514#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006515
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006516/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006517
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006518#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006519#define NEED_RETRY
6520#endif
6521
6522/* XXX This code is limited to "true" double-byte encodings, as
6523 a) it assumes an incomplete character consists of a single byte, and
6524 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00006525 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006526
/* Return non-zero if the byte at s[offset] should be treated as a DBCS
   lead byte, 0 otherwise.

   IsDBCSLeadByte() alone is not trusted: when it reports a lead byte, the
   position of the preceding character (CharPrev) is inspected as well. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
6538
6539/*
6540 * Decode MBCS string into unicode object. If 'final' is set, converts
6541 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6542 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006543static int
6544decode_mbcs(PyUnicodeObject **v,
6545 const char *s, /* MBCS string */
6546 int size, /* sizeof MBCS string */
6547 int final,
6548 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006549{
6550 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00006551 Py_ssize_t n;
6552 DWORD usize;
6553 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006554
6555 assert(size >= 0);
6556
Victor Stinner554f3f02010-06-16 23:33:54 +00006557 /* check and handle 'errors' arg */
6558 if (errors==NULL || strcmp(errors, "strict")==0)
6559 flags = MB_ERR_INVALID_CHARS;
6560 else if (strcmp(errors, "ignore")==0)
6561 flags = 0;
6562 else {
6563 PyErr_Format(PyExc_ValueError,
6564 "mbcs encoding does not support errors='%s'",
6565 errors);
6566 return -1;
6567 }
6568
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006569 /* Skip trailing lead-byte unless 'final' is set */
6570 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006571 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006572
6573 /* First get the size of the result */
6574 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006575 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6576 if (usize==0)
6577 goto mbcs_decode_error;
6578 } else
6579 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006580
6581 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006582 /* Create unicode object */
6583 *v = _PyUnicode_New(usize);
6584 if (*v == NULL)
6585 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006586 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006587 }
6588 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006589 /* Extend unicode object */
6590 n = PyUnicode_GET_SIZE(*v);
Victor Stinner2fd82272011-10-03 04:06:05 +02006591 if (PyUnicode_Resize((PyObject**)v, n + usize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006592 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006593 }
6594
6595 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00006596 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006597 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006598 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6599 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00006600 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006601 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006602 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00006603
6604mbcs_decode_error:
6605 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6606 we raise a UnicodeDecodeError - else it is a 'generic'
6607 windows error
6608 */
6609 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6610 /* Ideally, we should get reason from FormatMessage - this
6611 is the Windows 2000 English version of the message
6612 */
6613 PyObject *exc = NULL;
6614 const char *reason = "No mapping for the Unicode character exists "
6615 "in the target multi-byte code page.";
6616 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6617 if (exc != NULL) {
6618 PyCodec_StrictErrors(exc);
6619 Py_DECREF(exc);
6620 }
6621 } else {
6622 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6623 }
6624 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006625}
6626
Alexander Belopolsky40018472011-02-26 01:02:56 +00006627PyObject *
6628PyUnicode_DecodeMBCSStateful(const char *s,
6629 Py_ssize_t size,
6630 const char *errors,
6631 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006632{
6633 PyUnicodeObject *v = NULL;
6634 int done;
6635
6636 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006637 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006638
6639#ifdef NEED_RETRY
6640 retry:
6641 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006642 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006643 else
6644#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006645 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006646
6647 if (done < 0) {
6648 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006649 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006650 }
6651
6652 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006653 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006654
6655#ifdef NEED_RETRY
6656 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006657 s += done;
6658 size -= done;
6659 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006660 }
6661#endif
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006662 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006663 Py_DECREF(v);
6664 return NULL;
6665 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006666 return (PyObject *)v;
6667}
6668
Alexander Belopolsky40018472011-02-26 01:02:56 +00006669PyObject *
6670PyUnicode_DecodeMBCS(const char *s,
6671 Py_ssize_t size,
6672 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006673{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006674 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6675}
6676
6677/*
6678 * Convert unicode into string object (MBCS).
6679 * Returns 0 if succeed, -1 otherwise.
6680 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006681static int
6682encode_mbcs(PyObject **repr,
6683 const Py_UNICODE *p, /* unicode */
6684 int size, /* size of unicode */
6685 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006686{
Victor Stinner554f3f02010-06-16 23:33:54 +00006687 BOOL usedDefaultChar = FALSE;
6688 BOOL *pusedDefaultChar;
6689 int mbcssize;
6690 Py_ssize_t n;
6691 PyObject *exc = NULL;
6692 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006693
6694 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006695
Victor Stinner554f3f02010-06-16 23:33:54 +00006696 /* check and handle 'errors' arg */
6697 if (errors==NULL || strcmp(errors, "strict")==0) {
6698 flags = WC_NO_BEST_FIT_CHARS;
6699 pusedDefaultChar = &usedDefaultChar;
6700 } else if (strcmp(errors, "replace")==0) {
6701 flags = 0;
6702 pusedDefaultChar = NULL;
6703 } else {
6704 PyErr_Format(PyExc_ValueError,
6705 "mbcs encoding does not support errors='%s'",
6706 errors);
6707 return -1;
6708 }
6709
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006710 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006711 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006712 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
6713 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00006714 if (mbcssize == 0) {
6715 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6716 return -1;
6717 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006718 /* If we used a default char, then we failed! */
6719 if (pusedDefaultChar && *pusedDefaultChar)
6720 goto mbcs_encode_error;
6721 } else {
6722 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006723 }
6724
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006725 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006726 /* Create string object */
6727 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
6728 if (*repr == NULL)
6729 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006730 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006731 }
6732 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006733 /* Extend string object */
6734 n = PyBytes_Size(*repr);
6735 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
6736 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006737 }
6738
6739 /* Do the conversion */
6740 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006741 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006742 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
6743 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006744 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6745 return -1;
6746 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006747 if (pusedDefaultChar && *pusedDefaultChar)
6748 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006749 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006750 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006751
6752mbcs_encode_error:
6753 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
6754 Py_XDECREF(exc);
6755 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006756}
6757
Alexander Belopolsky40018472011-02-26 01:02:56 +00006758PyObject *
6759PyUnicode_EncodeMBCS(const Py_UNICODE *p,
6760 Py_ssize_t size,
6761 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006762{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006763 PyObject *repr = NULL;
6764 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00006765
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006766#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00006767 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006768 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006769 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006770 else
6771#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006772 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006773
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006774 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006775 Py_XDECREF(repr);
6776 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006777 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006778
6779#ifdef NEED_RETRY
6780 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006781 p += INT_MAX;
6782 size -= INT_MAX;
6783 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006784 }
6785#endif
6786
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006787 return repr;
6788}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006789
Alexander Belopolsky40018472011-02-26 01:02:56 +00006790PyObject *
6791PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006792{
6793 if (!PyUnicode_Check(unicode)) {
6794 PyErr_BadArgument();
6795 return NULL;
6796 }
6797 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006798 PyUnicode_GET_SIZE(unicode),
6799 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006800}
6801
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006802#undef NEED_RETRY
6803
Victor Stinner99b95382011-07-04 14:23:54 +02006804#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006805
Guido van Rossumd57fd912000-03-10 22:53:23 +00006806/* --- Character Mapping Codec -------------------------------------------- */
6807
Alexander Belopolsky40018472011-02-26 01:02:56 +00006808PyObject *
6809PyUnicode_DecodeCharmap(const char *s,
6810 Py_ssize_t size,
6811 PyObject *mapping,
6812 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006813{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006814 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006815 Py_ssize_t startinpos;
6816 Py_ssize_t endinpos;
6817 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006818 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006819 PyUnicodeObject *v;
6820 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006821 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006822 PyObject *errorHandler = NULL;
6823 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006824 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006825 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006826
Guido van Rossumd57fd912000-03-10 22:53:23 +00006827 /* Default to Latin-1 */
6828 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006829 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006830
6831 v = _PyUnicode_New(size);
6832 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006833 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006834 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006835 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006836 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006837 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006838 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006839 mapstring = PyUnicode_AS_UNICODE(mapping);
6840 maplen = PyUnicode_GET_SIZE(mapping);
6841 while (s < e) {
6842 unsigned char ch = *s;
6843 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006844
Benjamin Peterson29060642009-01-31 22:14:21 +00006845 if (ch < maplen)
6846 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006847
Benjamin Peterson29060642009-01-31 22:14:21 +00006848 if (x == 0xfffe) {
6849 /* undefined mapping */
6850 outpos = p-PyUnicode_AS_UNICODE(v);
6851 startinpos = s-starts;
6852 endinpos = startinpos+1;
6853 if (unicode_decode_call_errorhandler(
6854 errors, &errorHandler,
6855 "charmap", "character maps to <undefined>",
6856 &starts, &e, &startinpos, &endinpos, &exc, &s,
6857 &v, &outpos, &p)) {
6858 goto onError;
6859 }
6860 continue;
6861 }
6862 *p++ = x;
6863 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006864 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006865 }
6866 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006867 while (s < e) {
6868 unsigned char ch = *s;
6869 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006870
Benjamin Peterson29060642009-01-31 22:14:21 +00006871 /* Get mapping (char ordinal -> integer, Unicode char or None) */
6872 w = PyLong_FromLong((long)ch);
6873 if (w == NULL)
6874 goto onError;
6875 x = PyObject_GetItem(mapping, w);
6876 Py_DECREF(w);
6877 if (x == NULL) {
6878 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6879 /* No mapping found means: mapping is undefined. */
6880 PyErr_Clear();
6881 x = Py_None;
6882 Py_INCREF(x);
6883 } else
6884 goto onError;
6885 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006886
Benjamin Peterson29060642009-01-31 22:14:21 +00006887 /* Apply mapping */
6888 if (PyLong_Check(x)) {
6889 long value = PyLong_AS_LONG(x);
6890 if (value < 0 || value > 65535) {
6891 PyErr_SetString(PyExc_TypeError,
6892 "character mapping must be in range(65536)");
6893 Py_DECREF(x);
6894 goto onError;
6895 }
6896 *p++ = (Py_UNICODE)value;
6897 }
6898 else if (x == Py_None) {
6899 /* undefined mapping */
6900 outpos = p-PyUnicode_AS_UNICODE(v);
6901 startinpos = s-starts;
6902 endinpos = startinpos+1;
6903 if (unicode_decode_call_errorhandler(
6904 errors, &errorHandler,
6905 "charmap", "character maps to <undefined>",
6906 &starts, &e, &startinpos, &endinpos, &exc, &s,
6907 &v, &outpos, &p)) {
6908 Py_DECREF(x);
6909 goto onError;
6910 }
6911 Py_DECREF(x);
6912 continue;
6913 }
6914 else if (PyUnicode_Check(x)) {
6915 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006916
Benjamin Peterson29060642009-01-31 22:14:21 +00006917 if (targetsize == 1)
6918 /* 1-1 mapping */
6919 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006920
Benjamin Peterson29060642009-01-31 22:14:21 +00006921 else if (targetsize > 1) {
6922 /* 1-n mapping */
6923 if (targetsize > extrachars) {
6924 /* resize first */
6925 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
6926 Py_ssize_t needed = (targetsize - extrachars) + \
6927 (targetsize << 2);
6928 extrachars += needed;
6929 /* XXX overflow detection missing */
Victor Stinnerfe226c02011-10-03 03:52:20 +02006930 if (PyUnicode_Resize((PyObject**)&v,
Benjamin Peterson29060642009-01-31 22:14:21 +00006931 PyUnicode_GET_SIZE(v) + needed) < 0) {
6932 Py_DECREF(x);
6933 goto onError;
6934 }
6935 p = PyUnicode_AS_UNICODE(v) + oldpos;
6936 }
6937 Py_UNICODE_COPY(p,
6938 PyUnicode_AS_UNICODE(x),
6939 targetsize);
6940 p += targetsize;
6941 extrachars -= targetsize;
6942 }
6943 /* 1-0 mapping: skip the character */
6944 }
6945 else {
6946 /* wrong return value */
6947 PyErr_SetString(PyExc_TypeError,
6948 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006949 Py_DECREF(x);
6950 goto onError;
6951 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006952 Py_DECREF(x);
6953 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006954 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006955 }
6956 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinnerfe226c02011-10-03 03:52:20 +02006957 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006958 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006959 Py_XDECREF(errorHandler);
6960 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006961 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006962 Py_DECREF(v);
6963 return NULL;
6964 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006965 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006966
Benjamin Peterson29060642009-01-31 22:14:21 +00006967 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006968 Py_XDECREF(errorHandler);
6969 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006970 Py_XDECREF(v);
6971 return NULL;
6972}
6973
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006974/* Charmap encoding: the lookup table */
6975
Alexander Belopolsky40018472011-02-26 01:02:56 +00006976struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00006977 PyObject_HEAD
6978 unsigned char level1[32];
6979 int count2, count3;
6980 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006981};
6982
6983static PyObject*
6984encoding_map_size(PyObject *obj, PyObject* args)
6985{
6986 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006987 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00006988 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006989}
6990
6991static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006992 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00006993 PyDoc_STR("Return the size (in bytes) of this object") },
6994 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006995};
6996
6997static void
6998encoding_map_dealloc(PyObject* o)
6999{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007000 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007001}
7002
7003static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007004 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007005 "EncodingMap", /*tp_name*/
7006 sizeof(struct encoding_map), /*tp_basicsize*/
7007 0, /*tp_itemsize*/
7008 /* methods */
7009 encoding_map_dealloc, /*tp_dealloc*/
7010 0, /*tp_print*/
7011 0, /*tp_getattr*/
7012 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007013 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007014 0, /*tp_repr*/
7015 0, /*tp_as_number*/
7016 0, /*tp_as_sequence*/
7017 0, /*tp_as_mapping*/
7018 0, /*tp_hash*/
7019 0, /*tp_call*/
7020 0, /*tp_str*/
7021 0, /*tp_getattro*/
7022 0, /*tp_setattro*/
7023 0, /*tp_as_buffer*/
7024 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7025 0, /*tp_doc*/
7026 0, /*tp_traverse*/
7027 0, /*tp_clear*/
7028 0, /*tp_richcompare*/
7029 0, /*tp_weaklistoffset*/
7030 0, /*tp_iter*/
7031 0, /*tp_iternext*/
7032 encoding_map_methods, /*tp_methods*/
7033 0, /*tp_members*/
7034 0, /*tp_getset*/
7035 0, /*tp_base*/
7036 0, /*tp_dict*/
7037 0, /*tp_descr_get*/
7038 0, /*tp_descr_set*/
7039 0, /*tp_dictoffset*/
7040 0, /*tp_init*/
7041 0, /*tp_alloc*/
7042 0, /*tp_new*/
7043 0, /*tp_free*/
7044 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007045};
7046
7047PyObject*
7048PyUnicode_BuildEncodingMap(PyObject* string)
7049{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007050 PyObject *result;
7051 struct encoding_map *mresult;
7052 int i;
7053 int need_dict = 0;
7054 unsigned char level1[32];
7055 unsigned char level2[512];
7056 unsigned char *mlevel1, *mlevel2, *mlevel3;
7057 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007058 int kind;
7059 void *data;
7060 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007061
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007062 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007063 PyErr_BadArgument();
7064 return NULL;
7065 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007066 kind = PyUnicode_KIND(string);
7067 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007068 memset(level1, 0xFF, sizeof level1);
7069 memset(level2, 0xFF, sizeof level2);
7070
7071 /* If there isn't a one-to-one mapping of NULL to \0,
7072 or if there are non-BMP characters, we need to use
7073 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007074 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007075 need_dict = 1;
7076 for (i = 1; i < 256; i++) {
7077 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007078 ch = PyUnicode_READ(kind, data, i);
7079 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007080 need_dict = 1;
7081 break;
7082 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007083 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007084 /* unmapped character */
7085 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007086 l1 = ch >> 11;
7087 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007088 if (level1[l1] == 0xFF)
7089 level1[l1] = count2++;
7090 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007091 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007092 }
7093
7094 if (count2 >= 0xFF || count3 >= 0xFF)
7095 need_dict = 1;
7096
7097 if (need_dict) {
7098 PyObject *result = PyDict_New();
7099 PyObject *key, *value;
7100 if (!result)
7101 return NULL;
7102 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007103 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007104 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007105 if (!key || !value)
7106 goto failed1;
7107 if (PyDict_SetItem(result, key, value) == -1)
7108 goto failed1;
7109 Py_DECREF(key);
7110 Py_DECREF(value);
7111 }
7112 return result;
7113 failed1:
7114 Py_XDECREF(key);
7115 Py_XDECREF(value);
7116 Py_DECREF(result);
7117 return NULL;
7118 }
7119
7120 /* Create a three-level trie */
7121 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7122 16*count2 + 128*count3 - 1);
7123 if (!result)
7124 return PyErr_NoMemory();
7125 PyObject_Init(result, &EncodingMapType);
7126 mresult = (struct encoding_map*)result;
7127 mresult->count2 = count2;
7128 mresult->count3 = count3;
7129 mlevel1 = mresult->level1;
7130 mlevel2 = mresult->level23;
7131 mlevel3 = mresult->level23 + 16*count2;
7132 memcpy(mlevel1, level1, 32);
7133 memset(mlevel2, 0xFF, 16*count2);
7134 memset(mlevel3, 0, 128*count3);
7135 count3 = 0;
7136 for (i = 1; i < 256; i++) {
7137 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007138 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007139 /* unmapped character */
7140 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007141 o1 = PyUnicode_READ(kind, data, i)>>11;
7142 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007143 i2 = 16*mlevel1[o1] + o2;
7144 if (mlevel2[i2] == 0xFF)
7145 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007146 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007147 i3 = 128*mlevel2[i2] + o3;
7148 mlevel3[i3] = i;
7149 }
7150 return result;
7151}
7152
7153static int
7154encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
7155{
7156 struct encoding_map *map = (struct encoding_map*)mapping;
7157 int l1 = c>>11;
7158 int l2 = (c>>7) & 0xF;
7159 int l3 = c & 0x7F;
7160 int i;
7161
7162#ifdef Py_UNICODE_WIDE
7163 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007164 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007165 }
7166#endif
7167 if (c == 0)
7168 return 0;
7169 /* level 1*/
7170 i = map->level1[l1];
7171 if (i == 0xFF) {
7172 return -1;
7173 }
7174 /* level 2*/
7175 i = map->level23[16*i+l2];
7176 if (i == 0xFF) {
7177 return -1;
7178 }
7179 /* level 3 */
7180 i = map->level23[16*map->count2 + 128*i + l3];
7181 if (i == 0) {
7182 return -1;
7183 }
7184 return i;
7185}
7186
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007187/* Lookup the character ch in the mapping. If the character
7188 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007189 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007190static PyObject *
7191charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007192{
Christian Heimes217cfd12007-12-02 14:31:20 +00007193 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007194 PyObject *x;
7195
7196 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007197 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007198 x = PyObject_GetItem(mapping, w);
7199 Py_DECREF(w);
7200 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007201 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7202 /* No mapping found means: mapping is undefined. */
7203 PyErr_Clear();
7204 x = Py_None;
7205 Py_INCREF(x);
7206 return x;
7207 } else
7208 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007209 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007210 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007211 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007212 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007213 long value = PyLong_AS_LONG(x);
7214 if (value < 0 || value > 255) {
7215 PyErr_SetString(PyExc_TypeError,
7216 "character mapping must be in range(256)");
7217 Py_DECREF(x);
7218 return NULL;
7219 }
7220 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007221 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007222 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007223 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007224 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007225 /* wrong return value */
7226 PyErr_Format(PyExc_TypeError,
7227 "character mapping must return integer, bytes or None, not %.400s",
7228 x->ob_type->tp_name);
7229 Py_DECREF(x);
7230 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007231 }
7232}
7233
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007234static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007235charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007236{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007237 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7238 /* exponentially overallocate to minimize reallocations */
7239 if (requiredsize < 2*outsize)
7240 requiredsize = 2*outsize;
7241 if (_PyBytes_Resize(outobj, requiredsize))
7242 return -1;
7243 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007244}
7245
/* Outcome of charmapencode_output() */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* lookup or resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007249/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007250 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007251 space is available. Return a new reference to the object that
7252 was put in the output buffer, or Py_None, if the mapping was undefined
7253 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007254 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007255static charmapencode_result
7256charmapencode_output(Py_UNICODE c, PyObject *mapping,
7257 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007258{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007259 PyObject *rep;
7260 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007261 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007262
Christian Heimes90aa7642007-12-19 02:45:37 +00007263 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007264 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007265 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007266 if (res == -1)
7267 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007268 if (outsize<requiredsize)
7269 if (charmapencode_resize(outobj, outpos, requiredsize))
7270 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007271 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007272 outstart[(*outpos)++] = (char)res;
7273 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007274 }
7275
7276 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007277 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007278 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007279 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007280 Py_DECREF(rep);
7281 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007282 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007283 if (PyLong_Check(rep)) {
7284 Py_ssize_t requiredsize = *outpos+1;
7285 if (outsize<requiredsize)
7286 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7287 Py_DECREF(rep);
7288 return enc_EXCEPTION;
7289 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007290 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007291 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007292 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007293 else {
7294 const char *repchars = PyBytes_AS_STRING(rep);
7295 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7296 Py_ssize_t requiredsize = *outpos+repsize;
7297 if (outsize<requiredsize)
7298 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7299 Py_DECREF(rep);
7300 return enc_EXCEPTION;
7301 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007302 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007303 memcpy(outstart + *outpos, repchars, repsize);
7304 *outpos += repsize;
7305 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007306 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007307 Py_DECREF(rep);
7308 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007309}
7310
7311/* handle an error in PyUnicode_EncodeCharmap
7312 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007313static int
7314charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00007315 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007316 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007317 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007318 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007319{
7320 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007321 Py_ssize_t repsize;
7322 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007323 Py_UNICODE *uni2;
7324 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007325 Py_ssize_t collstartpos = *inpos;
7326 Py_ssize_t collendpos = *inpos+1;
7327 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007328 char *encoding = "charmap";
7329 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007330 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007331
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007332 /* find all unencodable characters */
7333 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007334 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007335 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007336 int res = encoding_map_lookup(p[collendpos], mapping);
7337 if (res != -1)
7338 break;
7339 ++collendpos;
7340 continue;
7341 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007342
Benjamin Peterson29060642009-01-31 22:14:21 +00007343 rep = charmapencode_lookup(p[collendpos], mapping);
7344 if (rep==NULL)
7345 return -1;
7346 else if (rep!=Py_None) {
7347 Py_DECREF(rep);
7348 break;
7349 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007350 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007351 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007352 }
7353 /* cache callback name lookup
7354 * (if not done yet, i.e. it's the first error) */
7355 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007356 if ((errors==NULL) || (!strcmp(errors, "strict")))
7357 *known_errorHandler = 1;
7358 else if (!strcmp(errors, "replace"))
7359 *known_errorHandler = 2;
7360 else if (!strcmp(errors, "ignore"))
7361 *known_errorHandler = 3;
7362 else if (!strcmp(errors, "xmlcharrefreplace"))
7363 *known_errorHandler = 4;
7364 else
7365 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007366 }
7367 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007368 case 1: /* strict */
7369 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7370 return -1;
7371 case 2: /* replace */
7372 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007373 x = charmapencode_output('?', mapping, res, respos);
7374 if (x==enc_EXCEPTION) {
7375 return -1;
7376 }
7377 else if (x==enc_FAILED) {
7378 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7379 return -1;
7380 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007381 }
7382 /* fall through */
7383 case 3: /* ignore */
7384 *inpos = collendpos;
7385 break;
7386 case 4: /* xmlcharrefreplace */
7387 /* generate replacement (temporarily (mis)uses p) */
7388 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007389 char buffer[2+29+1+1];
7390 char *cp;
7391 sprintf(buffer, "&#%d;", (int)p[collpos]);
7392 for (cp = buffer; *cp; ++cp) {
7393 x = charmapencode_output(*cp, mapping, res, respos);
7394 if (x==enc_EXCEPTION)
7395 return -1;
7396 else if (x==enc_FAILED) {
7397 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7398 return -1;
7399 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007400 }
7401 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007402 *inpos = collendpos;
7403 break;
7404 default:
7405 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00007406 encoding, reason, p, size, exceptionObject,
7407 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007408 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007409 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007410 if (PyBytes_Check(repunicode)) {
7411 /* Directly copy bytes result to output. */
7412 Py_ssize_t outsize = PyBytes_Size(*res);
7413 Py_ssize_t requiredsize;
7414 repsize = PyBytes_Size(repunicode);
7415 requiredsize = *respos + repsize;
7416 if (requiredsize > outsize)
7417 /* Make room for all additional bytes. */
7418 if (charmapencode_resize(res, respos, requiredsize)) {
7419 Py_DECREF(repunicode);
7420 return -1;
7421 }
7422 memcpy(PyBytes_AsString(*res) + *respos,
7423 PyBytes_AsString(repunicode), repsize);
7424 *respos += repsize;
7425 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007426 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007427 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007428 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007429 /* generate replacement */
7430 repsize = PyUnicode_GET_SIZE(repunicode);
7431 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007432 x = charmapencode_output(*uni2, mapping, res, respos);
7433 if (x==enc_EXCEPTION) {
7434 return -1;
7435 }
7436 else if (x==enc_FAILED) {
7437 Py_DECREF(repunicode);
7438 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7439 return -1;
7440 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007441 }
7442 *inpos = newpos;
7443 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007444 }
7445 return 0;
7446}
7447
Alexander Belopolsky40018472011-02-26 01:02:56 +00007448PyObject *
7449PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7450 Py_ssize_t size,
7451 PyObject *mapping,
7452 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007453{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007454 /* output object */
7455 PyObject *res = NULL;
7456 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007457 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007458 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007459 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007460 PyObject *errorHandler = NULL;
7461 PyObject *exc = NULL;
7462 /* the following variable is used for caching string comparisons
7463 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7464 * 3=ignore, 4=xmlcharrefreplace */
7465 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007466
7467 /* Default to Latin-1 */
7468 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007469 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007470
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007471 /* allocate enough for a simple encoding without
7472 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007473 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007474 if (res == NULL)
7475 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007476 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007477 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007478
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007479 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007480 /* try to encode it */
7481 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7482 if (x==enc_EXCEPTION) /* error */
7483 goto onError;
7484 if (x==enc_FAILED) { /* unencodable character */
7485 if (charmap_encoding_error(p, size, &inpos, mapping,
7486 &exc,
7487 &known_errorHandler, &errorHandler, errors,
7488 &res, &respos)) {
7489 goto onError;
7490 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007491 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007492 else
7493 /* done with this character => adjust input position */
7494 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007495 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007496
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007497 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007498 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007499 if (_PyBytes_Resize(&res, respos) < 0)
7500 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00007501
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007502 Py_XDECREF(exc);
7503 Py_XDECREF(errorHandler);
7504 return res;
7505
Benjamin Peterson29060642009-01-31 22:14:21 +00007506 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007507 Py_XDECREF(res);
7508 Py_XDECREF(exc);
7509 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007510 return NULL;
7511}
7512
Alexander Belopolsky40018472011-02-26 01:02:56 +00007513PyObject *
7514PyUnicode_AsCharmapString(PyObject *unicode,
7515 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007516{
7517 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007518 PyErr_BadArgument();
7519 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007520 }
7521 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007522 PyUnicode_GET_SIZE(unicode),
7523 mapping,
7524 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007525}
7526
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007527/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007528static void
7529make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007530 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007531 Py_ssize_t startpos, Py_ssize_t endpos,
7532 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007533{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007534 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007535 *exceptionObject = _PyUnicodeTranslateError_Create(
7536 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007537 }
7538 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007539 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7540 goto onError;
7541 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7542 goto onError;
7543 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7544 goto onError;
7545 return;
7546 onError:
7547 Py_DECREF(*exceptionObject);
7548 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007549 }
7550}
7551
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007552/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007553static void
7554raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007555 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007556 Py_ssize_t startpos, Py_ssize_t endpos,
7557 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007558{
7559 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007560 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007561 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007562 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007563}
7564
7565/* error handling callback helper:
7566 build arguments, call the callback and check the arguments,
7567 put the result into newpos and return the replacement string, which
7568 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007569static PyObject *
7570unicode_translate_call_errorhandler(const char *errors,
7571 PyObject **errorHandler,
7572 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007573 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007574 Py_ssize_t startpos, Py_ssize_t endpos,
7575 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007576{
Benjamin Peterson142957c2008-07-04 19:55:29 +00007577 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007578
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007579 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007580 PyObject *restuple;
7581 PyObject *resunicode;
7582
7583 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007584 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007585 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007586 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007587 }
7588
7589 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007590 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007591 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007592 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007593
7594 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00007595 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007596 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007597 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007598 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00007599 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007600 Py_DECREF(restuple);
7601 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007602 }
7603 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00007604 &resunicode, &i_newpos)) {
7605 Py_DECREF(restuple);
7606 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007607 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007608 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007609 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007610 else
7611 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007612 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007613 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7614 Py_DECREF(restuple);
7615 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007616 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007617 Py_INCREF(resunicode);
7618 Py_DECREF(restuple);
7619 return resunicode;
7620}
7621
7622/* Lookup the character ch in the mapping and put the result in result,
7623 which must be decrefed by the caller.
7624 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007625static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007626charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007627{
Christian Heimes217cfd12007-12-02 14:31:20 +00007628 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007629 PyObject *x;
7630
7631 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007632 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007633 x = PyObject_GetItem(mapping, w);
7634 Py_DECREF(w);
7635 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007636 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7637 /* No mapping found means: use 1:1 mapping. */
7638 PyErr_Clear();
7639 *result = NULL;
7640 return 0;
7641 } else
7642 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007643 }
7644 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007645 *result = x;
7646 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007647 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007648 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007649 long value = PyLong_AS_LONG(x);
7650 long max = PyUnicode_GetMax();
7651 if (value < 0 || value > max) {
7652 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00007653 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007654 Py_DECREF(x);
7655 return -1;
7656 }
7657 *result = x;
7658 return 0;
7659 }
7660 else if (PyUnicode_Check(x)) {
7661 *result = x;
7662 return 0;
7663 }
7664 else {
7665 /* wrong return value */
7666 PyErr_SetString(PyExc_TypeError,
7667 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007668 Py_DECREF(x);
7669 return -1;
7670 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007671}
7672/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00007673 if not reallocate and adjust various state variables.
7674 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007675static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007676charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00007677 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007678{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007679 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00007680 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007681 /* exponentially overallocate to minimize reallocations */
7682 if (requiredsize < 2 * oldsize)
7683 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007684 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
7685 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007686 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007687 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007688 }
7689 return 0;
7690}
7691/* lookup the character, put the result in the output string and adjust
7692 various state variables. Return a new reference to the object that
7693 was put in the output buffer in *result, or Py_None, if the mapping was
7694 undefined (in which case no character was written).
7695 The called must decref result.
7696 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007697static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007698charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
7699 PyObject *mapping, Py_UCS4 **output,
7700 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007701 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007702{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007703 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
7704 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00007705 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007706 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007707 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007708 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007709 }
7710 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007711 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00007712 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007713 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007714 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007715 }
7716 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007717 Py_ssize_t repsize;
7718 if (PyUnicode_READY(*res) == -1)
7719 return -1;
7720 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00007721 if (repsize==1) {
7722 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007723 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00007724 }
7725 else if (repsize!=0) {
7726 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007727 Py_ssize_t requiredsize = *opos +
7728 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00007729 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007730 Py_ssize_t i;
7731 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007732 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007733 for(i = 0; i < repsize; i++)
7734 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00007735 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007736 }
7737 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007738 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007739 return 0;
7740}
7741
Alexander Belopolsky40018472011-02-26 01:02:56 +00007742PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007743_PyUnicode_TranslateCharmap(PyObject *input,
7744 PyObject *mapping,
7745 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007746{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007747 /* input object */
7748 char *idata;
7749 Py_ssize_t size, i;
7750 int kind;
7751 /* output buffer */
7752 Py_UCS4 *output = NULL;
7753 Py_ssize_t osize;
7754 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007755 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007756 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007757 char *reason = "character maps to <undefined>";
7758 PyObject *errorHandler = NULL;
7759 PyObject *exc = NULL;
7760 /* the following variable is used for caching string comparisons
7761 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7762 * 3=ignore, 4=xmlcharrefreplace */
7763 int known_errorHandler = -1;
7764
Guido van Rossumd57fd912000-03-10 22:53:23 +00007765 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007766 PyErr_BadArgument();
7767 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007768 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007769
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007770 if (PyUnicode_READY(input) == -1)
7771 return NULL;
7772 idata = (char*)PyUnicode_DATA(input);
7773 kind = PyUnicode_KIND(input);
7774 size = PyUnicode_GET_LENGTH(input);
7775 i = 0;
7776
7777 if (size == 0) {
7778 Py_INCREF(input);
7779 return input;
7780 }
7781
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007782 /* allocate enough for a simple 1:1 translation without
7783 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007784 osize = size;
7785 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
7786 opos = 0;
7787 if (output == NULL) {
7788 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00007789 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007790 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007791
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007792 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007793 /* try to encode it */
7794 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007795 if (charmaptranslate_output(input, i, mapping,
7796 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007797 Py_XDECREF(x);
7798 goto onError;
7799 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007800 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00007801 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007802 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007803 else { /* untranslatable character */
7804 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7805 Py_ssize_t repsize;
7806 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007807 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00007808 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007809 Py_ssize_t collstart = i;
7810 Py_ssize_t collend = i+1;
7811 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007812
Benjamin Peterson29060642009-01-31 22:14:21 +00007813 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007814 while (collend < size) {
7815 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007816 goto onError;
7817 Py_XDECREF(x);
7818 if (x!=Py_None)
7819 break;
7820 ++collend;
7821 }
7822 /* cache callback name lookup
7823 * (if not done yet, i.e. it's the first error) */
7824 if (known_errorHandler==-1) {
7825 if ((errors==NULL) || (!strcmp(errors, "strict")))
7826 known_errorHandler = 1;
7827 else if (!strcmp(errors, "replace"))
7828 known_errorHandler = 2;
7829 else if (!strcmp(errors, "ignore"))
7830 known_errorHandler = 3;
7831 else if (!strcmp(errors, "xmlcharrefreplace"))
7832 known_errorHandler = 4;
7833 else
7834 known_errorHandler = 0;
7835 }
7836 switch (known_errorHandler) {
7837 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007838 raise_translate_exception(&exc, input, collstart,
7839 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007840 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007841 case 2: /* replace */
7842 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007843 for (coll = collstart; coll<collend; coll++)
7844 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00007845 /* fall through */
7846 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007847 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007848 break;
7849 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007850 /* generate replacement (temporarily (mis)uses i) */
7851 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007852 char buffer[2+29+1+1];
7853 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007854 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
7855 if (charmaptranslate_makespace(&output, &osize,
7856 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00007857 goto onError;
7858 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007859 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00007860 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007861 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007862 break;
7863 default:
7864 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007865 reason, input, &exc,
7866 collstart, collend, &newpos);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02007867 if (repunicode == NULL || _PyUnicode_READY_REPLACE(&repunicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00007868 goto onError;
7869 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007870 repsize = PyUnicode_GET_LENGTH(repunicode);
7871 if (charmaptranslate_makespace(&output, &osize,
7872 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007873 Py_DECREF(repunicode);
7874 goto onError;
7875 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007876 for (uni2 = 0; repsize-->0; ++uni2)
7877 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
7878 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00007879 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007880 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007881 }
7882 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007883 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
7884 if (!res)
7885 goto onError;
7886 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007887 Py_XDECREF(exc);
7888 Py_XDECREF(errorHandler);
7889 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007890
Benjamin Peterson29060642009-01-31 22:14:21 +00007891 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007892 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007893 Py_XDECREF(exc);
7894 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007895 return NULL;
7896}
7897
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007898/* Deprecated. Use PyUnicode_Translate instead. */
7899PyObject *
7900PyUnicode_TranslateCharmap(const Py_UNICODE *p,
7901 Py_ssize_t size,
7902 PyObject *mapping,
7903 const char *errors)
7904{
7905 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7906 if (!unicode)
7907 return NULL;
7908 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
7909}
7910
Alexander Belopolsky40018472011-02-26 01:02:56 +00007911PyObject *
7912PyUnicode_Translate(PyObject *str,
7913 PyObject *mapping,
7914 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007915{
7916 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007917
Guido van Rossumd57fd912000-03-10 22:53:23 +00007918 str = PyUnicode_FromObject(str);
7919 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007920 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007921 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007922 Py_DECREF(str);
7923 return result;
Tim Petersced69f82003-09-16 20:30:58 +00007924
Benjamin Peterson29060642009-01-31 22:14:21 +00007925 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007926 Py_XDECREF(str);
7927 return NULL;
7928}
Tim Petersced69f82003-09-16 20:30:58 +00007929
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007930static Py_UCS4
7931fix_decimal_and_space_to_ascii(PyUnicodeObject *self)
7932{
7933 /* No need to call PyUnicode_READY(self) because this function is only
7934 called as a callback from fixup() which does it already. */
7935 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
7936 const int kind = PyUnicode_KIND(self);
7937 void *data = PyUnicode_DATA(self);
7938 Py_UCS4 maxchar = 0, ch, fixed;
7939 Py_ssize_t i;
7940
7941 for (i = 0; i < len; ++i) {
7942 ch = PyUnicode_READ(kind, data, i);
7943 fixed = 0;
7944 if (ch > 127) {
7945 if (Py_UNICODE_ISSPACE(ch))
7946 fixed = ' ';
7947 else {
7948 const int decimal = Py_UNICODE_TODECIMAL(ch);
7949 if (decimal >= 0)
7950 fixed = '0' + decimal;
7951 }
7952 if (fixed != 0) {
7953 if (fixed > maxchar)
7954 maxchar = fixed;
7955 PyUnicode_WRITE(kind, data, i, fixed);
7956 }
7957 else if (ch > maxchar)
7958 maxchar = ch;
7959 }
7960 else if (ch > maxchar)
7961 maxchar = ch;
7962 }
7963
7964 return maxchar;
7965}
7966
7967PyObject *
7968_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
7969{
7970 if (!PyUnicode_Check(unicode)) {
7971 PyErr_BadInternalCall();
7972 return NULL;
7973 }
7974 if (PyUnicode_READY(unicode) == -1)
7975 return NULL;
7976 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
7977 /* If the string is already ASCII, just return the same string */
7978 Py_INCREF(unicode);
7979 return unicode;
7980 }
7981 return fixup((PyUnicodeObject *)unicode, fix_decimal_and_space_to_ascii);
7982}
7983
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007984PyObject *
7985PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
7986 Py_ssize_t length)
7987{
7988 PyObject *result;
7989 Py_UNICODE *p; /* write pointer into result */
7990 Py_ssize_t i;
7991 /* Copy to a new string */
7992 result = (PyObject *)_PyUnicode_New(length);
7993 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
7994 if (result == NULL)
7995 return result;
7996 p = PyUnicode_AS_UNICODE(result);
7997 /* Iterate over code points */
7998 for (i = 0; i < length; i++) {
7999 Py_UNICODE ch =s[i];
8000 if (ch > 127) {
8001 int decimal = Py_UNICODE_TODECIMAL(ch);
8002 if (decimal >= 0)
8003 p[i] = '0' + decimal;
8004 }
8005 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008006 if (PyUnicode_READY((PyUnicodeObject*)result) == -1) {
8007 Py_DECREF(result);
8008 return NULL;
8009 }
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008010 return result;
8011}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008012/* --- Decimal Encoder ---------------------------------------------------- */
8013
Alexander Belopolsky40018472011-02-26 01:02:56 +00008014int
8015PyUnicode_EncodeDecimal(Py_UNICODE *s,
8016 Py_ssize_t length,
8017 char *output,
8018 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008019{
8020 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008021 PyObject *errorHandler = NULL;
8022 PyObject *exc = NULL;
8023 const char *encoding = "decimal";
8024 const char *reason = "invalid decimal Unicode string";
8025 /* the following variable is used for caching string comparisons
8026 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
8027 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008028
8029 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008030 PyErr_BadArgument();
8031 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008032 }
8033
8034 p = s;
8035 end = s + length;
8036 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008037 register Py_UNICODE ch = *p;
8038 int decimal;
8039 PyObject *repunicode;
8040 Py_ssize_t repsize;
8041 Py_ssize_t newpos;
8042 Py_UNICODE *uni2;
8043 Py_UNICODE *collstart;
8044 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00008045
Benjamin Peterson29060642009-01-31 22:14:21 +00008046 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008047 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00008048 ++p;
8049 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008050 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008051 decimal = Py_UNICODE_TODECIMAL(ch);
8052 if (decimal >= 0) {
8053 *output++ = '0' + decimal;
8054 ++p;
8055 continue;
8056 }
8057 if (0 < ch && ch < 256) {
8058 *output++ = (char)ch;
8059 ++p;
8060 continue;
8061 }
8062 /* All other characters are considered unencodable */
8063 collstart = p;
8064 collend = p+1;
8065 while (collend < end) {
8066 if ((0 < *collend && *collend < 256) ||
8067 !Py_UNICODE_ISSPACE(*collend) ||
8068 Py_UNICODE_TODECIMAL(*collend))
8069 break;
8070 }
8071 /* cache callback name lookup
8072 * (if not done yet, i.e. it's the first error) */
8073 if (known_errorHandler==-1) {
8074 if ((errors==NULL) || (!strcmp(errors, "strict")))
8075 known_errorHandler = 1;
8076 else if (!strcmp(errors, "replace"))
8077 known_errorHandler = 2;
8078 else if (!strcmp(errors, "ignore"))
8079 known_errorHandler = 3;
8080 else if (!strcmp(errors, "xmlcharrefreplace"))
8081 known_errorHandler = 4;
8082 else
8083 known_errorHandler = 0;
8084 }
8085 switch (known_errorHandler) {
8086 case 1: /* strict */
8087 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
8088 goto onError;
8089 case 2: /* replace */
8090 for (p = collstart; p < collend; ++p)
8091 *output++ = '?';
8092 /* fall through */
8093 case 3: /* ignore */
8094 p = collend;
8095 break;
8096 case 4: /* xmlcharrefreplace */
8097 /* generate replacement (temporarily (mis)uses p) */
8098 for (p = collstart; p < collend; ++p)
8099 output += sprintf(output, "&#%d;", (int)*p);
8100 p = collend;
8101 break;
8102 default:
8103 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
8104 encoding, reason, s, length, &exc,
8105 collstart-s, collend-s, &newpos);
8106 if (repunicode == NULL)
8107 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008108 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00008109 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008110 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8111 Py_DECREF(repunicode);
8112 goto onError;
8113 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008114 /* generate replacement */
8115 repsize = PyUnicode_GET_SIZE(repunicode);
8116 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
8117 Py_UNICODE ch = *uni2;
8118 if (Py_UNICODE_ISSPACE(ch))
8119 *output++ = ' ';
8120 else {
8121 decimal = Py_UNICODE_TODECIMAL(ch);
8122 if (decimal >= 0)
8123 *output++ = '0' + decimal;
8124 else if (0 < ch && ch < 256)
8125 *output++ = (char)ch;
8126 else {
8127 Py_DECREF(repunicode);
8128 raise_encode_exception(&exc, encoding,
8129 s, length, collstart-s, collend-s, reason);
8130 goto onError;
8131 }
8132 }
8133 }
8134 p = s + newpos;
8135 Py_DECREF(repunicode);
8136 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00008137 }
8138 /* 0-terminate the output string */
8139 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008140 Py_XDECREF(exc);
8141 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008142 return 0;
8143
Benjamin Peterson29060642009-01-31 22:14:21 +00008144 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008145 Py_XDECREF(exc);
8146 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008147 return -1;
8148}
8149
Guido van Rossumd57fd912000-03-10 22:53:23 +00008150/* --- Helpers ------------------------------------------------------------ */
8151
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008152#include "stringlib/ucs1lib.h"
8153#include "stringlib/fastsearch.h"
8154#include "stringlib/partition.h"
8155#include "stringlib/split.h"
8156#include "stringlib/count.h"
8157#include "stringlib/find.h"
8158#include "stringlib/localeutil.h"
8159#include "stringlib/undef.h"
8160
8161#include "stringlib/ucs2lib.h"
8162#include "stringlib/fastsearch.h"
8163#include "stringlib/partition.h"
8164#include "stringlib/split.h"
8165#include "stringlib/count.h"
8166#include "stringlib/find.h"
8167#include "stringlib/localeutil.h"
8168#include "stringlib/undef.h"
8169
8170#include "stringlib/ucs4lib.h"
8171#include "stringlib/fastsearch.h"
8172#include "stringlib/partition.h"
8173#include "stringlib/split.h"
8174#include "stringlib/count.h"
8175#include "stringlib/find.h"
8176#include "stringlib/localeutil.h"
8177#include "stringlib/undef.h"
8178
8179static Py_ssize_t
8180any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
8181 const Py_UCS1*, Py_ssize_t,
8182 Py_ssize_t, Py_ssize_t),
8183 Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
8184 const Py_UCS2*, Py_ssize_t,
8185 Py_ssize_t, Py_ssize_t),
8186 Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t,
8187 const Py_UCS4*, Py_ssize_t,
8188 Py_ssize_t, Py_ssize_t),
8189 PyObject* s1, PyObject* s2,
8190 Py_ssize_t start,
8191 Py_ssize_t end)
8192{
8193 int kind1, kind2, kind;
8194 void *buf1, *buf2;
8195 Py_ssize_t len1, len2, result;
8196
8197 kind1 = PyUnicode_KIND(s1);
8198 kind2 = PyUnicode_KIND(s2);
8199 kind = kind1 > kind2 ? kind1 : kind2;
8200 buf1 = PyUnicode_DATA(s1);
8201 buf2 = PyUnicode_DATA(s2);
8202 if (kind1 != kind)
8203 buf1 = _PyUnicode_AsKind(s1, kind);
8204 if (!buf1)
8205 return -2;
8206 if (kind2 != kind)
8207 buf2 = _PyUnicode_AsKind(s2, kind);
8208 if (!buf2) {
8209 if (kind1 != kind) PyMem_Free(buf1);
8210 return -2;
8211 }
8212 len1 = PyUnicode_GET_LENGTH(s1);
8213 len2 = PyUnicode_GET_LENGTH(s2);
8214
8215 switch(kind) {
8216 case PyUnicode_1BYTE_KIND:
8217 result = ucs1(buf1, len1, buf2, len2, start, end);
8218 break;
8219 case PyUnicode_2BYTE_KIND:
8220 result = ucs2(buf1, len1, buf2, len2, start, end);
8221 break;
8222 case PyUnicode_4BYTE_KIND:
8223 result = ucs4(buf1, len1, buf2, len2, start, end);
8224 break;
8225 default:
8226 assert(0); result = -2;
8227 }
8228
8229 if (kind1 != kind)
8230 PyMem_Free(buf1);
8231 if (kind2 != kind)
8232 PyMem_Free(buf2);
8233
8234 return result;
8235}
8236
8237Py_ssize_t
8238_PyUnicode_InsertThousandsGrouping(int kind, void *data,
8239 Py_ssize_t n_buffer,
8240 void *digits, Py_ssize_t n_digits,
8241 Py_ssize_t min_width,
8242 const char *grouping,
8243 const char *thousands_sep)
8244{
8245 switch(kind) {
8246 case PyUnicode_1BYTE_KIND:
8247 return _PyUnicode_ucs1_InsertThousandsGrouping(
8248 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8249 min_width, grouping, thousands_sep);
8250 case PyUnicode_2BYTE_KIND:
8251 return _PyUnicode_ucs2_InsertThousandsGrouping(
8252 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
8253 min_width, grouping, thousands_sep);
8254 case PyUnicode_4BYTE_KIND:
8255 return _PyUnicode_ucs4_InsertThousandsGrouping(
8256 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
8257 min_width, grouping, thousands_sep);
8258 }
8259 assert(0);
8260 return -1;
8261}
8262
8263
Eric Smith8c663262007-08-25 02:26:07 +00008264#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00008265#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008266
Thomas Wouters477c8d52006-05-27 19:21:47 +00008267#include "stringlib/count.h"
8268#include "stringlib/find.h"
Eric Smith5807c412008-05-11 21:00:57 +00008269
/* helper macro to fixup start/end slice values

   Clamps `end` to at most `len`; a negative `start` or `end` is taken
   relative to `len` and then clamped to 0.  `start` and `end` must be
   modifiable lvalues; every argument may be evaluated more than once.

   The body is wrapped in do { ... } while (0) so that the macro expands to
   exactly one statement and is safe as the unbraced body of an if/else.
   Invoke it with a trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00008284
Alexander Belopolsky40018472011-02-26 01:02:56 +00008285Py_ssize_t
8286PyUnicode_Count(PyObject *str,
8287 PyObject *substr,
8288 Py_ssize_t start,
8289 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008290{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008291 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008292 PyUnicodeObject* str_obj;
8293 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008294 int kind1, kind2, kind;
8295 void *buf1 = NULL, *buf2 = NULL;
8296 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008297
Thomas Wouters477c8d52006-05-27 19:21:47 +00008298 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008299 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008300 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008301 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02008302 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008303 Py_DECREF(str_obj);
8304 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008305 }
Tim Petersced69f82003-09-16 20:30:58 +00008306
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008307 kind1 = PyUnicode_KIND(str_obj);
8308 kind2 = PyUnicode_KIND(sub_obj);
8309 kind = kind1 > kind2 ? kind1 : kind2;
8310 buf1 = PyUnicode_DATA(str_obj);
8311 if (kind1 != kind)
8312 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
8313 if (!buf1)
8314 goto onError;
8315 buf2 = PyUnicode_DATA(sub_obj);
8316 if (kind2 != kind)
8317 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
8318 if (!buf2)
8319 goto onError;
8320 len1 = PyUnicode_GET_LENGTH(str_obj);
8321 len2 = PyUnicode_GET_LENGTH(sub_obj);
8322
8323 ADJUST_INDICES(start, end, len1);
8324 switch(kind) {
8325 case PyUnicode_1BYTE_KIND:
8326 result = ucs1lib_count(
8327 ((Py_UCS1*)buf1) + start, end - start,
8328 buf2, len2, PY_SSIZE_T_MAX
8329 );
8330 break;
8331 case PyUnicode_2BYTE_KIND:
8332 result = ucs2lib_count(
8333 ((Py_UCS2*)buf1) + start, end - start,
8334 buf2, len2, PY_SSIZE_T_MAX
8335 );
8336 break;
8337 case PyUnicode_4BYTE_KIND:
8338 result = ucs4lib_count(
8339 ((Py_UCS4*)buf1) + start, end - start,
8340 buf2, len2, PY_SSIZE_T_MAX
8341 );
8342 break;
8343 default:
8344 assert(0); result = 0;
8345 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008346
8347 Py_DECREF(sub_obj);
8348 Py_DECREF(str_obj);
8349
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008350 if (kind1 != kind)
8351 PyMem_Free(buf1);
8352 if (kind2 != kind)
8353 PyMem_Free(buf2);
8354
Guido van Rossumd57fd912000-03-10 22:53:23 +00008355 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008356 onError:
8357 Py_DECREF(sub_obj);
8358 Py_DECREF(str_obj);
8359 if (kind1 != kind && buf1)
8360 PyMem_Free(buf1);
8361 if (kind2 != kind && buf2)
8362 PyMem_Free(buf2);
8363 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008364}
8365
Alexander Belopolsky40018472011-02-26 01:02:56 +00008366Py_ssize_t
8367PyUnicode_Find(PyObject *str,
8368 PyObject *sub,
8369 Py_ssize_t start,
8370 Py_ssize_t end,
8371 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008372{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008373 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008374
Guido van Rossumd57fd912000-03-10 22:53:23 +00008375 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008376 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008377 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008378 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008379 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008380 Py_DECREF(str);
8381 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008382 }
Tim Petersced69f82003-09-16 20:30:58 +00008383
Thomas Wouters477c8d52006-05-27 19:21:47 +00008384 if (direction > 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008385 result = any_find_slice(
8386 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
8387 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008388 );
8389 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008390 result = any_find_slice(
8391 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
8392 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008393 );
8394
Guido van Rossumd57fd912000-03-10 22:53:23 +00008395 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008396 Py_DECREF(sub);
8397
Guido van Rossumd57fd912000-03-10 22:53:23 +00008398 return result;
8399}
8400
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008401Py_ssize_t
8402PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8403 Py_ssize_t start, Py_ssize_t end,
8404 int direction)
8405{
8406 char *result;
8407 int kind;
8408 if (PyUnicode_READY(str) == -1)
8409 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02008410 if (start < 0 || end < 0) {
8411 PyErr_SetString(PyExc_IndexError, "string index out of range");
8412 return -2;
8413 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008414 if (end > PyUnicode_GET_LENGTH(str))
8415 end = PyUnicode_GET_LENGTH(str);
8416 kind = PyUnicode_KIND(str);
8417 result = findchar(PyUnicode_1BYTE_DATA(str)
8418 + PyUnicode_KIND_SIZE(kind, start),
8419 kind,
8420 end-start, ch, direction);
8421 if (!result)
8422 return -1;
8423 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8424}
8425
Alexander Belopolsky40018472011-02-26 01:02:56 +00008426static int
8427tailmatch(PyUnicodeObject *self,
8428 PyUnicodeObject *substring,
8429 Py_ssize_t start,
8430 Py_ssize_t end,
8431 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008432{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008433 int kind_self;
8434 int kind_sub;
8435 void *data_self;
8436 void *data_sub;
8437 Py_ssize_t offset;
8438 Py_ssize_t i;
8439 Py_ssize_t end_sub;
8440
8441 if (PyUnicode_READY(self) == -1 ||
8442 PyUnicode_READY(substring) == -1)
8443 return 0;
8444
8445 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008446 return 1;
8447
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008448 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8449 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008450 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008451 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008452
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008453 kind_self = PyUnicode_KIND(self);
8454 data_self = PyUnicode_DATA(self);
8455 kind_sub = PyUnicode_KIND(substring);
8456 data_sub = PyUnicode_DATA(substring);
8457 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8458
8459 if (direction > 0)
8460 offset = end;
8461 else
8462 offset = start;
8463
8464 if (PyUnicode_READ(kind_self, data_self, offset) ==
8465 PyUnicode_READ(kind_sub, data_sub, 0) &&
8466 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8467 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8468 /* If both are of the same kind, memcmp is sufficient */
8469 if (kind_self == kind_sub) {
8470 return ! memcmp((char *)data_self +
8471 (offset * PyUnicode_CHARACTER_SIZE(substring)),
8472 data_sub,
8473 PyUnicode_GET_LENGTH(substring) *
8474 PyUnicode_CHARACTER_SIZE(substring));
8475 }
8476 /* otherwise we have to compare each character by first accesing it */
8477 else {
8478 /* We do not need to compare 0 and len(substring)-1 because
8479 the if statement above ensured already that they are equal
8480 when we end up here. */
8481 // TODO: honor direction and do a forward or backwards search
8482 for (i = 1; i < end_sub; ++i) {
8483 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8484 PyUnicode_READ(kind_sub, data_sub, i))
8485 return 0;
8486 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008487 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008488 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008489 }
8490
8491 return 0;
8492}
8493
Alexander Belopolsky40018472011-02-26 01:02:56 +00008494Py_ssize_t
8495PyUnicode_Tailmatch(PyObject *str,
8496 PyObject *substr,
8497 Py_ssize_t start,
8498 Py_ssize_t end,
8499 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008500{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008501 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008502
Guido van Rossumd57fd912000-03-10 22:53:23 +00008503 str = PyUnicode_FromObject(str);
8504 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008505 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008506 substr = PyUnicode_FromObject(substr);
8507 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008508 Py_DECREF(str);
8509 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008510 }
Tim Petersced69f82003-09-16 20:30:58 +00008511
Guido van Rossumd57fd912000-03-10 22:53:23 +00008512 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00008513 (PyUnicodeObject *)substr,
8514 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008515 Py_DECREF(str);
8516 Py_DECREF(substr);
8517 return result;
8518}
8519
Guido van Rossumd57fd912000-03-10 22:53:23 +00008520/* Apply fixfct filter to the Unicode object self and return a
8521 reference to the modified object */
8522
Alexander Belopolsky40018472011-02-26 01:02:56 +00008523static PyObject *
8524fixup(PyUnicodeObject *self,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008525 Py_UCS4 (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008526{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008527 PyObject *u;
8528 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008529
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008530 if (PyUnicode_READY(self) == -1)
8531 return NULL;
8532 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8533 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8534 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008535 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008536 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008537
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008538 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
8539 PyUnicode_GET_LENGTH(u) * PyUnicode_CHARACTER_SIZE(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008540
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008541 /* fix functions return the new maximum character in a string,
8542 if the kind of the resulting unicode object does not change,
8543 everything is fine. Otherwise we need to change the string kind
8544 and re-run the fix function. */
8545 maxchar_new = fixfct((PyUnicodeObject*)u);
8546 if (maxchar_new == 0)
8547 /* do nothing, keep maxchar_new at 0 which means no changes. */;
8548 else if (maxchar_new <= 127)
8549 maxchar_new = 127;
8550 else if (maxchar_new <= 255)
8551 maxchar_new = 255;
8552 else if (maxchar_new <= 65535)
8553 maxchar_new = 65535;
8554 else
8555 maxchar_new = 1114111; /* 0x10ffff */
8556
8557 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008558 /* fixfct should return TRUE if it modified the buffer. If
8559 FALSE, return a reference to the original buffer instead
8560 (to save space, not time) */
8561 Py_INCREF(self);
8562 Py_DECREF(u);
8563 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008564 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008565 else if (maxchar_new == maxchar_old) {
8566 return u;
8567 }
8568 else {
8569 /* In case the maximum character changed, we need to
8570 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008571 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008572 if (v == NULL) {
8573 Py_DECREF(u);
8574 return NULL;
8575 }
8576 if (maxchar_new > maxchar_old) {
8577 /* If the maxchar increased so that the kind changed, not all
8578 characters are representable anymore and we need to fix the
8579 string again. This only happens in very few cases. */
Victor Stinner157f83f2011-09-28 21:41:31 +02008580 if (PyUnicode_CopyCharacters(v, 0,
8581 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008582 PyUnicode_GET_LENGTH(self)) < 0)
8583 {
8584 Py_DECREF(u);
8585 return NULL;
8586 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008587 maxchar_old = fixfct((PyUnicodeObject*)v);
8588 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8589 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008590 else {
Victor Stinner157f83f2011-09-28 21:41:31 +02008591 if (PyUnicode_CopyCharacters(v, 0,
8592 u, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008593 PyUnicode_GET_LENGTH(self)) < 0)
8594 {
8595 Py_DECREF(u);
8596 return NULL;
8597 }
8598 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008599
8600 Py_DECREF(u);
8601 return v;
8602 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008603}
8604
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008605static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008606fixupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008607{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008608 /* No need to call PyUnicode_READY(self) because this function is only
8609 called as a callback from fixup() which does it already. */
8610 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8611 const int kind = PyUnicode_KIND(self);
8612 void *data = PyUnicode_DATA(self);
8613 int touched = 0;
8614 Py_UCS4 maxchar = 0;
8615 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008616
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008617 for (i = 0; i < len; ++i) {
8618 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8619 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8620 if (up != ch) {
8621 if (up > maxchar)
8622 maxchar = up;
8623 PyUnicode_WRITE(kind, data, i, up);
8624 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008625 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008626 else if (ch > maxchar)
8627 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008628 }
8629
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008630 if (touched)
8631 return maxchar;
8632 else
8633 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008634}
8635
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008636static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008637fixlower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008638{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008639 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8640 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8641 const int kind = PyUnicode_KIND(self);
8642 void *data = PyUnicode_DATA(self);
8643 int touched = 0;
8644 Py_UCS4 maxchar = 0;
8645 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008646
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008647 for(i = 0; i < len; ++i) {
8648 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8649 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8650 if (lo != ch) {
8651 if (lo > maxchar)
8652 maxchar = lo;
8653 PyUnicode_WRITE(kind, data, i, lo);
8654 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008655 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008656 else if (ch > maxchar)
8657 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008658 }
8659
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008660 if (touched)
8661 return maxchar;
8662 else
8663 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008664}
8665
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008666static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008667fixswapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008668{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008669 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8670 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8671 const int kind = PyUnicode_KIND(self);
8672 void *data = PyUnicode_DATA(self);
8673 int touched = 0;
8674 Py_UCS4 maxchar = 0;
8675 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008676
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008677 for(i = 0; i < len; ++i) {
8678 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8679 Py_UCS4 nu = 0;
8680
8681 if (Py_UNICODE_ISUPPER(ch))
8682 nu = Py_UNICODE_TOLOWER(ch);
8683 else if (Py_UNICODE_ISLOWER(ch))
8684 nu = Py_UNICODE_TOUPPER(ch);
8685
8686 if (nu != 0) {
8687 if (nu > maxchar)
8688 maxchar = nu;
8689 PyUnicode_WRITE(kind, data, i, nu);
8690 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008691 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008692 else if (ch > maxchar)
8693 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008694 }
8695
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008696 if (touched)
8697 return maxchar;
8698 else
8699 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008700}
8701
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008702static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008703fixcapitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008704{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008705 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8706 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8707 const int kind = PyUnicode_KIND(self);
8708 void *data = PyUnicode_DATA(self);
8709 int touched = 0;
8710 Py_UCS4 maxchar = 0;
8711 Py_ssize_t i = 0;
8712 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00008713
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008714 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008715 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008716
8717 ch = PyUnicode_READ(kind, data, i);
8718 if (!Py_UNICODE_ISUPPER(ch)) {
8719 maxchar = Py_UNICODE_TOUPPER(ch);
8720 PyUnicode_WRITE(kind, data, i, maxchar);
8721 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008722 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008723 ++i;
8724 for(; i < len; ++i) {
8725 ch = PyUnicode_READ(kind, data, i);
8726 if (!Py_UNICODE_ISLOWER(ch)) {
8727 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8728 if (lo > maxchar)
8729 maxchar = lo;
8730 PyUnicode_WRITE(kind, data, i, lo);
8731 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008732 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008733 else if (ch > maxchar)
8734 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008735 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008736
8737 if (touched)
8738 return maxchar;
8739 else
8740 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008741}
8742
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008743static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008744fixtitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008745{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008746 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8747 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8748 const int kind = PyUnicode_KIND(self);
8749 void *data = PyUnicode_DATA(self);
8750 Py_UCS4 maxchar = 0;
8751 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008752 int previous_is_cased;
8753
8754 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008755 if (len == 1) {
8756 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8757 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
8758 if (ti != ch) {
8759 PyUnicode_WRITE(kind, data, i, ti);
8760 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00008761 }
8762 else
8763 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008764 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008765 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008766 for(; i < len; ++i) {
8767 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8768 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00008769
Benjamin Peterson29060642009-01-31 22:14:21 +00008770 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008771 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00008772 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008773 nu = Py_UNICODE_TOTITLE(ch);
8774
8775 if (nu > maxchar)
8776 maxchar = nu;
8777 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00008778
Benjamin Peterson29060642009-01-31 22:14:21 +00008779 if (Py_UNICODE_ISLOWER(ch) ||
8780 Py_UNICODE_ISUPPER(ch) ||
8781 Py_UNICODE_ISTITLE(ch))
8782 previous_is_cased = 1;
8783 else
8784 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008785 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008786 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008787}
8788
Tim Peters8ce9f162004-08-27 01:49:32 +00008789PyObject *
8790PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008791{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008792 PyObject *sep = NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008793 Py_ssize_t seplen = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008794 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00008795 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008796 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
8797 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00008798 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008799 Py_ssize_t sz, i, res_offset;
8800 Py_UCS4 maxchar = 0;
8801 Py_UCS4 item_maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008802
Tim Peters05eba1f2004-08-27 21:32:02 +00008803 fseq = PySequence_Fast(seq, "");
8804 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008805 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00008806 }
8807
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008808 /* NOTE: the following code can't call back into Python code,
8809 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00008810 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008811
Tim Peters05eba1f2004-08-27 21:32:02 +00008812 seqlen = PySequence_Fast_GET_SIZE(fseq);
8813 /* If empty sequence, return u"". */
8814 if (seqlen == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008815 res = PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008816 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00008817 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008818 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00008819 /* If singleton sequence with an exact Unicode, return that. */
8820 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008821 item = items[0];
8822 if (PyUnicode_CheckExact(item)) {
8823 Py_INCREF(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008824 res = item;
Benjamin Peterson29060642009-01-31 22:14:21 +00008825 goto Done;
8826 }
Tim Peters8ce9f162004-08-27 01:49:32 +00008827 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008828 else {
8829 /* Set up sep and seplen */
8830 if (separator == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008831 /* fall back to a blank space separator */
8832 sep = PyUnicode_FromOrdinal(' ');
Victor Stinnere9a29352011-10-01 02:14:59 +02008833 if (!sep)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008834 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00008835 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008836 else {
8837 if (!PyUnicode_Check(separator)) {
8838 PyErr_Format(PyExc_TypeError,
8839 "separator: expected str instance,"
8840 " %.80s found",
8841 Py_TYPE(separator)->tp_name);
8842 goto onError;
8843 }
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02008844 if (PyUnicode_READY(separator))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008845 goto onError;
8846 sep = separator;
8847 seplen = PyUnicode_GET_LENGTH(separator);
8848 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
8849 /* inc refcount to keep this code path symetric with the
8850 above case of a blank separator */
8851 Py_INCREF(sep);
Tim Peters05eba1f2004-08-27 21:32:02 +00008852 }
8853 }
8854
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008855 /* There are at least two things to join, or else we have a subclass
8856 * of str in the sequence.
8857 * Do a pre-pass to figure out the total amount of space we'll
8858 * need (sz), and see whether all argument are strings.
8859 */
8860 sz = 0;
8861 for (i = 0; i < seqlen; i++) {
8862 const Py_ssize_t old_sz = sz;
8863 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00008864 if (!PyUnicode_Check(item)) {
8865 PyErr_Format(PyExc_TypeError,
8866 "sequence item %zd: expected str instance,"
8867 " %.80s found",
8868 i, Py_TYPE(item)->tp_name);
8869 goto onError;
8870 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008871 if (PyUnicode_READY(item) == -1)
8872 goto onError;
8873 sz += PyUnicode_GET_LENGTH(item);
8874 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
8875 if (item_maxchar > maxchar)
8876 maxchar = item_maxchar;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008877 if (i != 0)
8878 sz += seplen;
8879 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
8880 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008881 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008882 goto onError;
8883 }
8884 }
Tim Petersced69f82003-09-16 20:30:58 +00008885
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008886 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008887 if (res == NULL)
8888 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00008889
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008890 /* Catenate everything. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008891 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008892 Py_ssize_t itemlen;
8893 item = items[i];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008894 itemlen = PyUnicode_GET_LENGTH(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008895 /* Copy item, and maybe the separator. */
8896 if (i) {
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008897 if (PyUnicode_CopyCharacters(res, res_offset,
8898 sep, 0, seplen) < 0)
8899 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008900 res_offset += seplen;
Benjamin Peterson29060642009-01-31 22:14:21 +00008901 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008902 if (PyUnicode_CopyCharacters(res, res_offset,
8903 item, 0, itemlen) < 0)
8904 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008905 res_offset += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00008906 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008907 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00008908
Benjamin Peterson29060642009-01-31 22:14:21 +00008909 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00008910 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008911 Py_XDECREF(sep);
8912 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008913
Benjamin Peterson29060642009-01-31 22:14:21 +00008914 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00008915 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008916 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00008917 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008918 return NULL;
8919}
8920
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008921#define FILL(kind, data, value, start, length) \
8922 do { \
8923 Py_ssize_t i_ = 0; \
8924 assert(kind != PyUnicode_WCHAR_KIND); \
8925 switch ((kind)) { \
8926 case PyUnicode_1BYTE_KIND: { \
8927 unsigned char * to_ = (unsigned char *)((data)) + (start); \
8928 memset(to_, (unsigned char)value, length); \
8929 break; \
8930 } \
8931 case PyUnicode_2BYTE_KIND: { \
8932 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
8933 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
8934 break; \
8935 } \
8936 default: { \
8937 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
8938 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
8939 break; \
8940 } \
8941 } \
8942 } while (0)
8943
Alexander Belopolsky40018472011-02-26 01:02:56 +00008944static PyUnicodeObject *
8945pad(PyUnicodeObject *self,
8946 Py_ssize_t left,
8947 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008948 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008949{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008950 PyObject *u;
8951 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008952 int kind;
8953 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008954
8955 if (left < 0)
8956 left = 0;
8957 if (right < 0)
8958 right = 0;
8959
Tim Peters7a29bd52001-09-12 03:03:31 +00008960 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008961 Py_INCREF(self);
8962 return self;
8963 }
8964
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008965 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
8966 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00008967 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
8968 return NULL;
8969 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008970 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
8971 if (fill > maxchar)
8972 maxchar = fill;
8973 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008974 if (!u)
8975 return NULL;
8976
8977 kind = PyUnicode_KIND(u);
8978 data = PyUnicode_DATA(u);
8979 if (left)
8980 FILL(kind, data, fill, 0, left);
8981 if (right)
8982 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinner157f83f2011-09-28 21:41:31 +02008983 if (PyUnicode_CopyCharacters(u, left,
8984 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008985 _PyUnicode_LENGTH(self)) < 0)
8986 {
8987 Py_DECREF(u);
8988 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008989 }
8990
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008991 return (PyUnicodeObject*)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008992}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008993#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00008994
Alexander Belopolsky40018472011-02-26 01:02:56 +00008995PyObject *
8996PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008997{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008998 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008999
9000 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009001 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009002 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009003
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009004 switch(PyUnicode_KIND(string)) {
9005 case PyUnicode_1BYTE_KIND:
9006 list = ucs1lib_splitlines(
9007 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
9008 PyUnicode_GET_LENGTH(string), keepends);
9009 break;
9010 case PyUnicode_2BYTE_KIND:
9011 list = ucs2lib_splitlines(
9012 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
9013 PyUnicode_GET_LENGTH(string), keepends);
9014 break;
9015 case PyUnicode_4BYTE_KIND:
9016 list = ucs4lib_splitlines(
9017 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
9018 PyUnicode_GET_LENGTH(string), keepends);
9019 break;
9020 default:
9021 assert(0);
9022 list = 0;
9023 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009024 Py_DECREF(string);
9025 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009026}
9027
Alexander Belopolsky40018472011-02-26 01:02:56 +00009028static PyObject *
9029split(PyUnicodeObject *self,
9030 PyUnicodeObject *substring,
9031 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009032{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009033 int kind1, kind2, kind;
9034 void *buf1, *buf2;
9035 Py_ssize_t len1, len2;
9036 PyObject* out;
9037
Guido van Rossumd57fd912000-03-10 22:53:23 +00009038 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009039 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009040
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009041 if (PyUnicode_READY(self) == -1)
9042 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009043
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009044 if (substring == NULL)
9045 switch(PyUnicode_KIND(self)) {
9046 case PyUnicode_1BYTE_KIND:
9047 return ucs1lib_split_whitespace(
9048 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9049 PyUnicode_GET_LENGTH(self), maxcount
9050 );
9051 case PyUnicode_2BYTE_KIND:
9052 return ucs2lib_split_whitespace(
9053 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9054 PyUnicode_GET_LENGTH(self), maxcount
9055 );
9056 case PyUnicode_4BYTE_KIND:
9057 return ucs4lib_split_whitespace(
9058 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9059 PyUnicode_GET_LENGTH(self), maxcount
9060 );
9061 default:
9062 assert(0);
9063 return NULL;
9064 }
9065
9066 if (PyUnicode_READY(substring) == -1)
9067 return NULL;
9068
9069 kind1 = PyUnicode_KIND(self);
9070 kind2 = PyUnicode_KIND(substring);
9071 kind = kind1 > kind2 ? kind1 : kind2;
9072 buf1 = PyUnicode_DATA(self);
9073 buf2 = PyUnicode_DATA(substring);
9074 if (kind1 != kind)
9075 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9076 if (!buf1)
9077 return NULL;
9078 if (kind2 != kind)
9079 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9080 if (!buf2) {
9081 if (kind1 != kind) PyMem_Free(buf1);
9082 return NULL;
9083 }
9084 len1 = PyUnicode_GET_LENGTH(self);
9085 len2 = PyUnicode_GET_LENGTH(substring);
9086
9087 switch(kind) {
9088 case PyUnicode_1BYTE_KIND:
9089 out = ucs1lib_split(
9090 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9091 break;
9092 case PyUnicode_2BYTE_KIND:
9093 out = ucs2lib_split(
9094 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9095 break;
9096 case PyUnicode_4BYTE_KIND:
9097 out = ucs4lib_split(
9098 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9099 break;
9100 default:
9101 out = NULL;
9102 }
9103 if (kind1 != kind)
9104 PyMem_Free(buf1);
9105 if (kind2 != kind)
9106 PyMem_Free(buf2);
9107 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009108}
9109
Alexander Belopolsky40018472011-02-26 01:02:56 +00009110static PyObject *
9111rsplit(PyUnicodeObject *self,
9112 PyUnicodeObject *substring,
9113 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009114{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009115 int kind1, kind2, kind;
9116 void *buf1, *buf2;
9117 Py_ssize_t len1, len2;
9118 PyObject* out;
9119
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009120 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009121 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009122
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009123 if (PyUnicode_READY(self) == -1)
9124 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009125
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009126 if (substring == NULL)
9127 switch(PyUnicode_KIND(self)) {
9128 case PyUnicode_1BYTE_KIND:
9129 return ucs1lib_rsplit_whitespace(
9130 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9131 PyUnicode_GET_LENGTH(self), maxcount
9132 );
9133 case PyUnicode_2BYTE_KIND:
9134 return ucs2lib_rsplit_whitespace(
9135 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9136 PyUnicode_GET_LENGTH(self), maxcount
9137 );
9138 case PyUnicode_4BYTE_KIND:
9139 return ucs4lib_rsplit_whitespace(
9140 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9141 PyUnicode_GET_LENGTH(self), maxcount
9142 );
9143 default:
9144 assert(0);
9145 return NULL;
9146 }
9147
9148 if (PyUnicode_READY(substring) == -1)
9149 return NULL;
9150
9151 kind1 = PyUnicode_KIND(self);
9152 kind2 = PyUnicode_KIND(substring);
9153 kind = kind1 > kind2 ? kind1 : kind2;
9154 buf1 = PyUnicode_DATA(self);
9155 buf2 = PyUnicode_DATA(substring);
9156 if (kind1 != kind)
9157 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9158 if (!buf1)
9159 return NULL;
9160 if (kind2 != kind)
9161 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9162 if (!buf2) {
9163 if (kind1 != kind) PyMem_Free(buf1);
9164 return NULL;
9165 }
9166 len1 = PyUnicode_GET_LENGTH(self);
9167 len2 = PyUnicode_GET_LENGTH(substring);
9168
9169 switch(kind) {
9170 case PyUnicode_1BYTE_KIND:
9171 out = ucs1lib_rsplit(
9172 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9173 break;
9174 case PyUnicode_2BYTE_KIND:
9175 out = ucs2lib_rsplit(
9176 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9177 break;
9178 case PyUnicode_4BYTE_KIND:
9179 out = ucs4lib_rsplit(
9180 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9181 break;
9182 default:
9183 out = NULL;
9184 }
9185 if (kind1 != kind)
9186 PyMem_Free(buf1);
9187 if (kind2 != kind)
9188 PyMem_Free(buf2);
9189 return out;
9190}
9191
9192static Py_ssize_t
9193anylib_find(int kind, void *buf1, Py_ssize_t len1,
9194 void *buf2, Py_ssize_t len2, Py_ssize_t offset)
9195{
9196 switch(kind) {
9197 case PyUnicode_1BYTE_KIND:
9198 return ucs1lib_find(buf1, len1, buf2, len2, offset);
9199 case PyUnicode_2BYTE_KIND:
9200 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9201 case PyUnicode_4BYTE_KIND:
9202 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9203 }
9204 assert(0);
9205 return -1;
9206}
9207
9208static Py_ssize_t
9209anylib_count(int kind, void* sbuf, Py_ssize_t slen,
9210 void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
9211{
9212 switch(kind) {
9213 case PyUnicode_1BYTE_KIND:
9214 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
9215 case PyUnicode_2BYTE_KIND:
9216 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9217 case PyUnicode_4BYTE_KIND:
9218 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9219 }
9220 assert(0);
9221 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009222}
9223
Alexander Belopolsky40018472011-02-26 01:02:56 +00009224static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009225replace(PyObject *self, PyObject *str1,
9226 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009227{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009228 PyObject *u;
9229 char *sbuf = PyUnicode_DATA(self);
9230 char *buf1 = PyUnicode_DATA(str1);
9231 char *buf2 = PyUnicode_DATA(str2);
9232 int srelease = 0, release1 = 0, release2 = 0;
9233 int skind = PyUnicode_KIND(self);
9234 int kind1 = PyUnicode_KIND(str1);
9235 int kind2 = PyUnicode_KIND(str2);
9236 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9237 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9238 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009239
9240 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009241 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009242 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009243 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009244
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009245 if (skind < kind1)
9246 /* substring too wide to be present */
9247 goto nothing;
9248
9249 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00009250 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009251 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009252 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009253 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009254 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009255 /* replace characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009256 Py_UCS4 u1, u2, maxchar;
9257 int mayshrink, rkind;
9258 u1 = PyUnicode_READ_CHAR(str1, 0);
9259 if (!findchar(sbuf, PyUnicode_KIND(self),
9260 slen, u1, 1))
Thomas Wouters477c8d52006-05-27 19:21:47 +00009261 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009262 u2 = PyUnicode_READ_CHAR(str2, 0);
9263 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9264 /* Replacing u1 with u2 may cause a maxchar reduction in the
9265 result string. */
9266 mayshrink = maxchar > 127;
9267 if (u2 > maxchar) {
9268 maxchar = u2;
9269 mayshrink = 0;
9270 }
9271 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009272 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009273 goto error;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009274 if (PyUnicode_CopyCharacters(u, 0,
9275 (PyObject*)self, 0, slen) < 0)
9276 {
9277 Py_DECREF(u);
9278 return NULL;
9279 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009280 rkind = PyUnicode_KIND(u);
9281 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
9282 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009283 if (--maxcount < 0)
9284 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009285 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009286 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009287 if (mayshrink) {
9288 PyObject *tmp = u;
9289 u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp),
9290 PyUnicode_GET_LENGTH(tmp));
9291 Py_DECREF(tmp);
9292 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009293 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009294 int rkind = skind;
9295 char *res;
9296 if (kind1 < rkind) {
9297 /* widen substring */
9298 buf1 = _PyUnicode_AsKind(str1, rkind);
9299 if (!buf1) goto error;
9300 release1 = 1;
9301 }
9302 i = anylib_find(rkind, sbuf, slen, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009303 if (i < 0)
9304 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009305 if (rkind > kind2) {
9306 /* widen replacement */
9307 buf2 = _PyUnicode_AsKind(str2, rkind);
9308 if (!buf2) goto error;
9309 release2 = 1;
9310 }
9311 else if (rkind < kind2) {
9312 /* widen self and buf1 */
9313 rkind = kind2;
9314 if (release1) PyMem_Free(buf1);
9315 sbuf = _PyUnicode_AsKind(self, rkind);
9316 if (!sbuf) goto error;
9317 srelease = 1;
9318 buf1 = _PyUnicode_AsKind(str1, rkind);
9319 if (!buf1) goto error;
9320 release1 = 1;
9321 }
9322 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen));
9323 if (!res) {
9324 PyErr_NoMemory();
9325 goto error;
9326 }
9327 memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen));
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009328 /* change everything in-place, starting with this one */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009329 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9330 buf2,
9331 PyUnicode_KIND_SIZE(rkind, len2));
9332 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009333
9334 while ( --maxcount > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009335 i = anylib_find(rkind, sbuf+PyUnicode_KIND_SIZE(rkind, i),
9336 slen-i,
9337 buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009338 if (i == -1)
9339 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009340 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9341 buf2,
9342 PyUnicode_KIND_SIZE(rkind, len2));
9343 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009344 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009345
9346 u = PyUnicode_FromKindAndData(rkind, res, slen);
9347 PyMem_Free(res);
9348 if (!u) goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009349 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009350 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009351
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009352 Py_ssize_t n, i, j, ires;
9353 Py_ssize_t product, new_size;
9354 int rkind = skind;
9355 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009356
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009357 if (kind1 < rkind) {
9358 buf1 = _PyUnicode_AsKind(str1, rkind);
9359 if (!buf1) goto error;
9360 release1 = 1;
9361 }
9362 n = anylib_count(rkind, sbuf, slen, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009363 if (n == 0)
9364 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009365 if (kind2 < rkind) {
9366 buf2 = _PyUnicode_AsKind(str2, rkind);
9367 if (!buf2) goto error;
9368 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009369 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009370 else if (kind2 > rkind) {
9371 rkind = kind2;
9372 sbuf = _PyUnicode_AsKind(self, rkind);
9373 if (!sbuf) goto error;
9374 srelease = 1;
9375 if (release1) PyMem_Free(buf1);
9376 buf1 = _PyUnicode_AsKind(str1, rkind);
9377 if (!buf1) goto error;
9378 release1 = 1;
9379 }
9380 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
9381 PyUnicode_GET_LENGTH(str1))); */
9382 product = n * (len2-len1);
9383 if ((product / (len2-len1)) != n) {
9384 PyErr_SetString(PyExc_OverflowError,
9385 "replace string is too long");
9386 goto error;
9387 }
9388 new_size = slen + product;
9389 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
9390 PyErr_SetString(PyExc_OverflowError,
9391 "replace string is too long");
9392 goto error;
9393 }
9394 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size));
9395 if (!res)
9396 goto error;
9397 ires = i = 0;
9398 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009399 while (n-- > 0) {
9400 /* look for next match */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009401 j = anylib_find(rkind,
9402 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9403 slen-i, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009404 if (j == -1)
9405 break;
9406 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009407 /* copy unchanged part [i:j] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009408 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9409 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9410 PyUnicode_KIND_SIZE(rkind, j-i));
9411 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009412 }
9413 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009414 if (len2 > 0) {
9415 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9416 buf2,
9417 PyUnicode_KIND_SIZE(rkind, len2));
9418 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009419 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009420 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009421 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009422 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009423 /* copy tail [i:] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009424 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9425 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9426 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009427 } else {
9428 /* interleave */
9429 while (n > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009430 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9431 buf2,
9432 PyUnicode_KIND_SIZE(rkind, len2));
9433 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009434 if (--n <= 0)
9435 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009436 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9437 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9438 PyUnicode_KIND_SIZE(rkind, 1));
9439 ires++;
9440 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009441 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009442 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9443 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9444 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009445 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009446 u = PyUnicode_FromKindAndData(rkind, res, new_size);
Martin v. Löwis0b1d3482011-10-01 16:35:40 +02009447 PyMem_Free(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009448 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009449 if (srelease)
9450 PyMem_FREE(sbuf);
9451 if (release1)
9452 PyMem_FREE(buf1);
9453 if (release2)
9454 PyMem_FREE(buf2);
9455 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009456
Benjamin Peterson29060642009-01-31 22:14:21 +00009457 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00009458 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009459 if (srelease)
9460 PyMem_FREE(sbuf);
9461 if (release1)
9462 PyMem_FREE(buf1);
9463 if (release2)
9464 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009465 if (PyUnicode_CheckExact(self)) {
9466 Py_INCREF(self);
9467 return (PyObject *) self;
9468 }
Victor Stinner034f6cf2011-09-30 02:26:44 +02009469 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009470 error:
9471 if (srelease && sbuf)
9472 PyMem_FREE(sbuf);
9473 if (release1 && buf1)
9474 PyMem_FREE(buf1);
9475 if (release2 && buf2)
9476 PyMem_FREE(buf2);
9477 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009478}
9479
9480/* --- Unicode Object Methods --------------------------------------------- */
9481
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009482PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009483 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009484\n\
9485Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009486characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009487
9488static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009489unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009490{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009491 return fixup(self, fixtitle);
9492}
9493
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009494PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009495 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009496\n\
9497Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00009498have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009499
9500static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009501unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009502{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009503 return fixup(self, fixcapitalize);
9504}
9505
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled code: split S on whitespace, capitalize every word in place in
   the resulting list, then join the words with the default separator. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *word = PyList_GET_ITEM(words, idx);
        PyObject *capitalized = fixup((PyUnicodeObject *)word,
                                      fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* PyList_SET_ITEM does not release the old item, so do it here. */
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
9543
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009544/* Argument converter. Coerces to a single unicode character */
9545
9546static int
9547convert_uc(PyObject *obj, void *addr)
9548{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009549 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009550 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009551
Benjamin Peterson14339b62009-01-31 16:36:08 +00009552 uniobj = PyUnicode_FromObject(obj);
9553 if (uniobj == NULL) {
9554 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009555 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009556 return 0;
9557 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009558 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009559 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009560 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009561 Py_DECREF(uniobj);
9562 return 0;
9563 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009564 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009565 Py_DECREF(uniobj);
9566 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009567}
9568
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009569PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009570 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009571\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009572Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009573done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009574
9575static PyObject *
9576unicode_center(PyUnicodeObject *self, PyObject *args)
9577{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009578 Py_ssize_t marg, left;
9579 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009580 Py_UCS4 fillchar = ' ';
9581
Victor Stinnere9a29352011-10-01 02:14:59 +02009582 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009583 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009584
Victor Stinnere9a29352011-10-01 02:14:59 +02009585 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009586 return NULL;
9587
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009588 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009589 Py_INCREF(self);
9590 return (PyObject*) self;
9591 }
9592
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009593 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009594 left = marg / 2 + (marg & width & 1);
9595
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009596 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009597}
9598
Marc-André Lemburge5034372000-08-08 08:04:29 +00009599#if 0
9600
9601/* This code should go into some future Unicode collation support
9602 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00009603 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00009604
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009605/* speedy UTF-16 code point order comparison */
9606/* gleaned from: */
9607/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
9608
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009609static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009610{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009611 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00009612 0, 0, 0, 0, 0, 0, 0, 0,
9613 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009614 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009615};
9616
Guido van Rossumd57fd912000-03-10 22:53:23 +00009617static int
9618unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9619{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009620 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009621
Guido van Rossumd57fd912000-03-10 22:53:23 +00009622 Py_UNICODE *s1 = str1->str;
9623 Py_UNICODE *s2 = str2->str;
9624
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009625 len1 = str1->_base._base.length;
9626 len2 = str2->_base._base.length;
Tim Petersced69f82003-09-16 20:30:58 +00009627
Guido van Rossumd57fd912000-03-10 22:53:23 +00009628 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00009629 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009630
9631 c1 = *s1++;
9632 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00009633
Benjamin Peterson29060642009-01-31 22:14:21 +00009634 if (c1 > (1<<11) * 26)
9635 c1 += utf16Fixup[c1>>11];
9636 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009637 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009638 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00009639
9640 if (c1 != c2)
9641 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00009642
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009643 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009644 }
9645
9646 return (len1 < len2) ? -1 : (len1 != len2);
9647}
9648
Marc-André Lemburge5034372000-08-08 08:04:29 +00009649#else
9650
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009651/* This function assumes that str1 and str2 are readied by the caller. */
9652
Marc-André Lemburge5034372000-08-08 08:04:29 +00009653static int
9654unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9655{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009656 int kind1, kind2;
9657 void *data1, *data2;
9658 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009659
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009660 kind1 = PyUnicode_KIND(str1);
9661 kind2 = PyUnicode_KIND(str2);
9662 data1 = PyUnicode_DATA(str1);
9663 data2 = PyUnicode_DATA(str2);
9664 len1 = PyUnicode_GET_LENGTH(str1);
9665 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +00009666
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009667 for (i = 0; i < len1 && i < len2; ++i) {
9668 Py_UCS4 c1, c2;
9669 c1 = PyUnicode_READ(kind1, data1, i);
9670 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +00009671
9672 if (c1 != c2)
9673 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009674 }
9675
9676 return (len1 < len2) ? -1 : (len1 != len2);
9677}
9678
9679#endif
9680
Alexander Belopolsky40018472011-02-26 01:02:56 +00009681int
9682PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009683{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009684 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9685 if (PyUnicode_READY(left) == -1 ||
9686 PyUnicode_READY(right) == -1)
9687 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009688 return unicode_compare((PyUnicodeObject *)left,
9689 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009690 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009691 PyErr_Format(PyExc_TypeError,
9692 "Can't compare %.100s and %.100s",
9693 left->ob_type->tp_name,
9694 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009695 return -1;
9696}
9697
Martin v. Löwis5b222132007-06-10 09:51:05 +00009698int
9699PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
9700{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009701 Py_ssize_t i;
9702 int kind;
9703 void *data;
9704 Py_UCS4 chr;
9705
Victor Stinner910337b2011-10-03 03:20:16 +02009706 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009707 if (PyUnicode_READY(uni) == -1)
9708 return -1;
9709 kind = PyUnicode_KIND(uni);
9710 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009711 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009712 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
9713 if (chr != str[i])
9714 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00009715 /* This check keeps Python strings that end in '\0' from comparing equal
9716 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009717 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +00009718 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009719 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00009720 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009721 return 0;
9722}
9723
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009724
Benjamin Peterson29060642009-01-31 22:14:21 +00009725#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00009726 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009727
Alexander Belopolsky40018472011-02-26 01:02:56 +00009728PyObject *
9729PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009730{
9731 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009732
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009733 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9734 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009735 if (PyUnicode_READY(left) == -1 ||
9736 PyUnicode_READY(right) == -1)
9737 return NULL;
9738 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
9739 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009740 if (op == Py_EQ) {
9741 Py_INCREF(Py_False);
9742 return Py_False;
9743 }
9744 if (op == Py_NE) {
9745 Py_INCREF(Py_True);
9746 return Py_True;
9747 }
9748 }
9749 if (left == right)
9750 result = 0;
9751 else
9752 result = unicode_compare((PyUnicodeObject *)left,
9753 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009754
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009755 /* Convert the return value to a Boolean */
9756 switch (op) {
9757 case Py_EQ:
9758 v = TEST_COND(result == 0);
9759 break;
9760 case Py_NE:
9761 v = TEST_COND(result != 0);
9762 break;
9763 case Py_LE:
9764 v = TEST_COND(result <= 0);
9765 break;
9766 case Py_GE:
9767 v = TEST_COND(result >= 0);
9768 break;
9769 case Py_LT:
9770 v = TEST_COND(result == -1);
9771 break;
9772 case Py_GT:
9773 v = TEST_COND(result == 1);
9774 break;
9775 default:
9776 PyErr_BadArgument();
9777 return NULL;
9778 }
9779 Py_INCREF(v);
9780 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009781 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009782
Brian Curtindfc80e32011-08-10 20:28:54 -05009783 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009784}
9785
Alexander Belopolsky40018472011-02-26 01:02:56 +00009786int
9787PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00009788{
Thomas Wouters477c8d52006-05-27 19:21:47 +00009789 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009790 int kind1, kind2, kind;
9791 void *buf1, *buf2;
9792 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009793 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009794
9795 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00009796 sub = PyUnicode_FromObject(element);
9797 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009798 PyErr_Format(PyExc_TypeError,
9799 "'in <string>' requires string as left operand, not %s",
9800 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009801 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009802 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009803 if (PyUnicode_READY(sub) == -1)
9804 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009805
Thomas Wouters477c8d52006-05-27 19:21:47 +00009806 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +02009807 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009808 Py_DECREF(sub);
9809 return -1;
9810 }
9811
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009812 kind1 = PyUnicode_KIND(str);
9813 kind2 = PyUnicode_KIND(sub);
9814 kind = kind1 > kind2 ? kind1 : kind2;
9815 buf1 = PyUnicode_DATA(str);
9816 buf2 = PyUnicode_DATA(sub);
9817 if (kind1 != kind)
9818 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
9819 if (!buf1) {
9820 Py_DECREF(sub);
9821 return -1;
9822 }
9823 if (kind2 != kind)
9824 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
9825 if (!buf2) {
9826 Py_DECREF(sub);
9827 if (kind1 != kind) PyMem_Free(buf1);
9828 return -1;
9829 }
9830 len1 = PyUnicode_GET_LENGTH(str);
9831 len2 = PyUnicode_GET_LENGTH(sub);
9832
9833 switch(kind) {
9834 case PyUnicode_1BYTE_KIND:
9835 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
9836 break;
9837 case PyUnicode_2BYTE_KIND:
9838 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
9839 break;
9840 case PyUnicode_4BYTE_KIND:
9841 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
9842 break;
9843 default:
9844 result = -1;
9845 assert(0);
9846 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009847
9848 Py_DECREF(str);
9849 Py_DECREF(sub);
9850
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009851 if (kind1 != kind)
9852 PyMem_Free(buf1);
9853 if (kind2 != kind)
9854 PyMem_Free(buf2);
9855
Guido van Rossum403d68b2000-03-13 15:55:09 +00009856 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009857}
9858
Guido van Rossumd57fd912000-03-10 22:53:23 +00009859/* Concat to string or Unicode object giving a new Unicode object. */
9860
Alexander Belopolsky40018472011-02-26 01:02:56 +00009861PyObject *
9862PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009863{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009864 PyObject *u = NULL, *v = NULL, *w;
9865 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009866
9867 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009868 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009869 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009870 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009871 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009872 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009873 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009874
9875 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +02009876 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009877 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009878 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009879 }
Victor Stinnera464fc12011-10-02 20:39:30 +02009880 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009881 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009882 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009883 }
9884
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009885 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinnerff9e50f2011-09-28 22:17:19 +02009886 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009887
Guido van Rossumd57fd912000-03-10 22:53:23 +00009888 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009889 w = PyUnicode_New(
9890 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
9891 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009892 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009893 goto onError;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009894 if (PyUnicode_CopyCharacters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)) < 0)
9895 goto onError;
Victor Stinner157f83f2011-09-28 21:41:31 +02009896 if (PyUnicode_CopyCharacters(w, PyUnicode_GET_LENGTH(u),
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009897 v, 0,
9898 PyUnicode_GET_LENGTH(v)) < 0)
9899 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009900 Py_DECREF(u);
9901 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009902 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009903
Benjamin Peterson29060642009-01-31 22:14:21 +00009904 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009905 Py_XDECREF(u);
9906 Py_XDECREF(v);
9907 return NULL;
9908}
9909
Walter Dörwald1ab83302007-05-18 17:15:44 +00009910void
Victor Stinner23e56682011-10-03 03:54:37 +02009911PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +00009912{
Victor Stinner23e56682011-10-03 03:54:37 +02009913 PyObject *left, *res;
9914
9915 if (p_left == NULL) {
9916 if (!PyErr_Occurred())
9917 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +00009918 return;
9919 }
Victor Stinner23e56682011-10-03 03:54:37 +02009920 left = *p_left;
9921 if (right == NULL || !PyUnicode_Check(left)) {
9922 if (!PyErr_Occurred())
9923 PyErr_BadInternalCall();
9924 goto error;
9925 }
9926
9927 if (PyUnicode_CheckExact(left) && left != unicode_empty
9928 && PyUnicode_CheckExact(right) && right != unicode_empty
9929 && unicode_resizable(left)
9930 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
9931 || _PyUnicode_WSTR(left) != NULL))
9932 {
9933 Py_ssize_t u_len, v_len, new_len, copied;
9934
9935 /* FIXME: don't make wstr string ready */
9936 if (PyUnicode_READY(left))
9937 goto error;
9938 if (PyUnicode_READY(right))
9939 goto error;
9940
9941 /* FIXME: support ascii+latin1, PyASCIIObject => PyCompactUnicodeObject */
9942 if (PyUnicode_MAX_CHAR_VALUE(right) <= PyUnicode_MAX_CHAR_VALUE(left))
9943 {
9944 u_len = PyUnicode_GET_LENGTH(left);
9945 v_len = PyUnicode_GET_LENGTH(right);
9946 if (u_len > PY_SSIZE_T_MAX - v_len) {
9947 PyErr_SetString(PyExc_OverflowError,
9948 "strings are too large to concat");
9949 goto error;
9950 }
9951 new_len = u_len + v_len;
9952
9953 /* Now we own the last reference to 'left', so we can resize it
9954 * in-place.
9955 */
9956 if (unicode_resize(&left, new_len) != 0) {
9957 /* XXX if _PyUnicode_Resize() fails, 'left' has been
9958 * deallocated so it cannot be put back into
9959 * 'variable'. The MemoryError is raised when there
9960 * is no value in 'variable', which might (very
9961 * remotely) be a cause of incompatibilities.
9962 */
9963 goto error;
9964 }
9965 /* copy 'right' into the newly allocated area of 'left' */
9966 copied = PyUnicode_CopyCharacters(left, u_len,
9967 right, 0,
9968 v_len);
9969 assert(0 <= copied);
9970 *p_left = left;
9971 return;
9972 }
9973 }
9974
9975 res = PyUnicode_Concat(left, right);
9976 if (res == NULL)
9977 goto error;
9978 Py_DECREF(left);
9979 *p_left = res;
9980 return;
9981
9982error:
9983 Py_DECREF(*p_left);
9984 *p_left = NULL;
Walter Dörwald1ab83302007-05-18 17:15:44 +00009985}
9986
9987void
9988PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
9989{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009990 PyUnicode_Append(pleft, right);
9991 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00009992}
9993
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009994PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009995 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009996\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00009997Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009998string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009999interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010000
10001static PyObject *
10002unicode_count(PyUnicodeObject *self, PyObject *args)
10003{
10004 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010005 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010006 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010007 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010008 int kind1, kind2, kind;
10009 void *buf1, *buf2;
10010 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010011
Jesus Ceaac451502011-04-20 17:09:23 +020010012 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10013 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010014 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010015
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010016 kind1 = PyUnicode_KIND(self);
10017 kind2 = PyUnicode_KIND(substring);
10018 kind = kind1 > kind2 ? kind1 : kind2;
10019 buf1 = PyUnicode_DATA(self);
10020 buf2 = PyUnicode_DATA(substring);
10021 if (kind1 != kind)
10022 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
10023 if (!buf1) {
10024 Py_DECREF(substring);
10025 return NULL;
10026 }
10027 if (kind2 != kind)
10028 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
10029 if (!buf2) {
10030 Py_DECREF(substring);
10031 if (kind1 != kind) PyMem_Free(buf1);
10032 return NULL;
10033 }
10034 len1 = PyUnicode_GET_LENGTH(self);
10035 len2 = PyUnicode_GET_LENGTH(substring);
10036
10037 ADJUST_INDICES(start, end, len1);
10038 switch(kind) {
10039 case PyUnicode_1BYTE_KIND:
10040 iresult = ucs1lib_count(
10041 ((Py_UCS1*)buf1) + start, end - start,
10042 buf2, len2, PY_SSIZE_T_MAX
10043 );
10044 break;
10045 case PyUnicode_2BYTE_KIND:
10046 iresult = ucs2lib_count(
10047 ((Py_UCS2*)buf1) + start, end - start,
10048 buf2, len2, PY_SSIZE_T_MAX
10049 );
10050 break;
10051 case PyUnicode_4BYTE_KIND:
10052 iresult = ucs4lib_count(
10053 ((Py_UCS4*)buf1) + start, end - start,
10054 buf2, len2, PY_SSIZE_T_MAX
10055 );
10056 break;
10057 default:
10058 assert(0); iresult = 0;
10059 }
10060
10061 result = PyLong_FromSsize_t(iresult);
10062
10063 if (kind1 != kind)
10064 PyMem_Free(buf1);
10065 if (kind2 != kind)
10066 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010067
10068 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010069
Guido van Rossumd57fd912000-03-10 22:53:23 +000010070 return result;
10071}
10072
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010073PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010074 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010075\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010076Encode S using the codec registered for encoding. Default encoding\n\
10077is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010078handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010079a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10080'xmlcharrefreplace' as well as any other name registered with\n\
10081codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010082
10083static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +000010084unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010085{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010086 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010087 char *encoding = NULL;
10088 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010089
Benjamin Peterson308d6372009-09-18 21:42:35 +000010090 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10091 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010092 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +000010093 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010094}
10095
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010096PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010097 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010098\n\
10099Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010100If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010101
10102static PyObject*
10103unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
10104{
10105 Py_UNICODE *e;
10106 Py_UNICODE *p;
10107 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010108 Py_UNICODE *qe;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010109 Py_ssize_t i, j, incr, wstr_length;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010110 PyUnicodeObject *u;
10111 int tabsize = 8;
10112
10113 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010114 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010115
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010116 if (PyUnicode_AsUnicodeAndSize((PyObject *)self, &wstr_length) == NULL)
10117 return NULL;
10118
Thomas Wouters7e474022000-07-16 12:04:32 +000010119 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010120 i = 0; /* chars up to and including most recent \n or \r */
10121 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010122 e = _PyUnicode_WSTR(self) + wstr_length; /* end of input */
10123 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010124 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010125 if (tabsize > 0) {
10126 incr = tabsize - (j % tabsize); /* cannot overflow */
10127 if (j > PY_SSIZE_T_MAX - incr)
10128 goto overflow1;
10129 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010130 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010131 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010132 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010133 if (j > PY_SSIZE_T_MAX - 1)
10134 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010135 j++;
10136 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010137 if (i > PY_SSIZE_T_MAX - j)
10138 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010139 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010140 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010141 }
10142 }
10143
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010144 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +000010145 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010146
Guido van Rossumd57fd912000-03-10 22:53:23 +000010147 /* Second pass: create output string and fill it */
10148 u = _PyUnicode_New(i + j);
10149 if (!u)
10150 return NULL;
10151
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010152 j = 0; /* same as in first pass */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010153 q = _PyUnicode_WSTR(u); /* next output char */
10154 qe = _PyUnicode_WSTR(u) + PyUnicode_GET_SIZE(u); /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +000010155
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010156 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010157 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010158 if (tabsize > 0) {
10159 i = tabsize - (j % tabsize);
10160 j += i;
10161 while (i--) {
10162 if (q >= qe)
10163 goto overflow2;
10164 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010165 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010166 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010167 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010168 else {
10169 if (q >= qe)
10170 goto overflow2;
10171 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010172 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010173 if (*p == '\n' || *p == '\r')
10174 j = 0;
10175 }
10176
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020010177 if (_PyUnicode_READY_REPLACE(&u)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010178 Py_DECREF(u);
10179 return NULL;
10180 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010181 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010182
10183 overflow2:
10184 Py_DECREF(u);
10185 overflow1:
10186 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10187 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010188}
10189
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010190PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010191 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010192\n\
10193Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010194such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010195arguments start and end are interpreted as in slice notation.\n\
10196\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010197Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010198
10199static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010200unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010201{
Jesus Ceaac451502011-04-20 17:09:23 +020010202 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010203 Py_ssize_t start;
10204 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010205 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010206
Jesus Ceaac451502011-04-20 17:09:23 +020010207 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10208 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010209 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010210
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010211 if (PyUnicode_READY(self) == -1)
10212 return NULL;
10213 if (PyUnicode_READY(substring) == -1)
10214 return NULL;
10215
10216 result = any_find_slice(
10217 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
10218 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010219 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010220
10221 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010222
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010223 if (result == -2)
10224 return NULL;
10225
Christian Heimes217cfd12007-12-02 14:31:20 +000010226 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010227}
10228
10229static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010230unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010231{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010232 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
10233 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010234 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010235 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010236}
10237
Guido van Rossumc2504932007-09-18 19:42:40 +000010238/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010010239 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000010240static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +000010241unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010242{
Guido van Rossumc2504932007-09-18 19:42:40 +000010243 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010010244 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010245
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010246 if (_PyUnicode_HASH(self) != -1)
10247 return _PyUnicode_HASH(self);
10248 if (PyUnicode_READY(self) == -1)
10249 return -1;
10250 len = PyUnicode_GET_LENGTH(self);
10251
10252 /* The hash function as a macro, gets expanded three times below. */
10253#define HASH(P) \
10254 x = (Py_uhash_t)*P << 7; \
10255 while (--len >= 0) \
10256 x = (1000003*x) ^ (Py_uhash_t)*P++;
10257
10258 switch (PyUnicode_KIND(self)) {
10259 case PyUnicode_1BYTE_KIND: {
10260 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
10261 HASH(c);
10262 break;
10263 }
10264 case PyUnicode_2BYTE_KIND: {
10265 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
10266 HASH(s);
10267 break;
10268 }
10269 default: {
10270 Py_UCS4 *l;
10271 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
10272 "Impossible switch case in unicode_hash");
10273 l = PyUnicode_4BYTE_DATA(self);
10274 HASH(l);
10275 break;
10276 }
10277 }
10278 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
10279
Guido van Rossumc2504932007-09-18 19:42:40 +000010280 if (x == -1)
10281 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010282 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010283 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010284}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010285#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000010286
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010287PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010288 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010289\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010290Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010291
10292static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010293unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010294{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010295 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +020010296 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010297 Py_ssize_t start;
10298 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010299
Jesus Ceaac451502011-04-20 17:09:23 +020010300 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
10301 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010302 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010303
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010304 if (PyUnicode_READY(self) == -1)
10305 return NULL;
10306 if (PyUnicode_READY(substring) == -1)
10307 return NULL;
10308
10309 result = any_find_slice(
10310 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
10311 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010312 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010313
10314 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010315
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010316 if (result == -2)
10317 return NULL;
10318
Guido van Rossumd57fd912000-03-10 22:53:23 +000010319 if (result < 0) {
10320 PyErr_SetString(PyExc_ValueError, "substring not found");
10321 return NULL;
10322 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010323
Christian Heimes217cfd12007-12-02 14:31:20 +000010324 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010325}
10326
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010327PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010328 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010329\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010330Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010331at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010332
10333static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010334unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010335{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010336 Py_ssize_t i, length;
10337 int kind;
10338 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010339 int cased;
10340
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010341 if (PyUnicode_READY(self) == -1)
10342 return NULL;
10343 length = PyUnicode_GET_LENGTH(self);
10344 kind = PyUnicode_KIND(self);
10345 data = PyUnicode_DATA(self);
10346
Guido van Rossumd57fd912000-03-10 22:53:23 +000010347 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010348 if (length == 1)
10349 return PyBool_FromLong(
10350 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010351
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010352 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010353 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010354 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010355
Guido van Rossumd57fd912000-03-10 22:53:23 +000010356 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010357 for (i = 0; i < length; i++) {
10358 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010359
Benjamin Peterson29060642009-01-31 22:14:21 +000010360 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
10361 return PyBool_FromLong(0);
10362 else if (!cased && Py_UNICODE_ISLOWER(ch))
10363 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010364 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010365 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010366}
10367
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010368PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010369 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010370\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010371Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010372at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010373
10374static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010375unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010376{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010377 Py_ssize_t i, length;
10378 int kind;
10379 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010380 int cased;
10381
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010382 if (PyUnicode_READY(self) == -1)
10383 return NULL;
10384 length = PyUnicode_GET_LENGTH(self);
10385 kind = PyUnicode_KIND(self);
10386 data = PyUnicode_DATA(self);
10387
Guido van Rossumd57fd912000-03-10 22:53:23 +000010388 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010389 if (length == 1)
10390 return PyBool_FromLong(
10391 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010392
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010393 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010394 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010395 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010396
Guido van Rossumd57fd912000-03-10 22:53:23 +000010397 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010398 for (i = 0; i < length; i++) {
10399 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010400
Benjamin Peterson29060642009-01-31 22:14:21 +000010401 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
10402 return PyBool_FromLong(0);
10403 else if (!cased && Py_UNICODE_ISUPPER(ch))
10404 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010405 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010406 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010407}
10408
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010409PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010410 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010411\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010412Return True if S is a titlecased string and there is at least one\n\
10413character in S, i.e. upper- and titlecase characters may only\n\
10414follow uncased characters and lowercase characters only cased ones.\n\
10415Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010416
10417static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010418unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010419{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010420 Py_ssize_t i, length;
10421 int kind;
10422 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010423 int cased, previous_is_cased;
10424
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010425 if (PyUnicode_READY(self) == -1)
10426 return NULL;
10427 length = PyUnicode_GET_LENGTH(self);
10428 kind = PyUnicode_KIND(self);
10429 data = PyUnicode_DATA(self);
10430
Guido van Rossumd57fd912000-03-10 22:53:23 +000010431 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010432 if (length == 1) {
10433 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10434 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
10435 (Py_UNICODE_ISUPPER(ch) != 0));
10436 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010437
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010438 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010439 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010440 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010441
Guido van Rossumd57fd912000-03-10 22:53:23 +000010442 cased = 0;
10443 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010444 for (i = 0; i < length; i++) {
10445 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010446
Benjamin Peterson29060642009-01-31 22:14:21 +000010447 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
10448 if (previous_is_cased)
10449 return PyBool_FromLong(0);
10450 previous_is_cased = 1;
10451 cased = 1;
10452 }
10453 else if (Py_UNICODE_ISLOWER(ch)) {
10454 if (!previous_is_cased)
10455 return PyBool_FromLong(0);
10456 previous_is_cased = 1;
10457 cased = 1;
10458 }
10459 else
10460 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010461 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010462 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010463}
10464
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010465PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010466 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010467\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010468Return True if all characters in S are whitespace\n\
10469and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010470
10471static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010472unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010473{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010474 Py_ssize_t i, length;
10475 int kind;
10476 void *data;
10477
10478 if (PyUnicode_READY(self) == -1)
10479 return NULL;
10480 length = PyUnicode_GET_LENGTH(self);
10481 kind = PyUnicode_KIND(self);
10482 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010483
Guido van Rossumd57fd912000-03-10 22:53:23 +000010484 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010485 if (length == 1)
10486 return PyBool_FromLong(
10487 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010488
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010489 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010490 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010491 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010492
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010493 for (i = 0; i < length; i++) {
10494 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010495 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010496 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010497 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010498 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010499}
10500
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010501PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010502 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010503\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010504Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010505and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010506
10507static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010508unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010509{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010510 Py_ssize_t i, length;
10511 int kind;
10512 void *data;
10513
10514 if (PyUnicode_READY(self) == -1)
10515 return NULL;
10516 length = PyUnicode_GET_LENGTH(self);
10517 kind = PyUnicode_KIND(self);
10518 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010519
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010520 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010521 if (length == 1)
10522 return PyBool_FromLong(
10523 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010524
10525 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010526 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010527 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010528
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010529 for (i = 0; i < length; i++) {
10530 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010531 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010532 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010533 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010534}
10535
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010536PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010537 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010538\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010539Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010540and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010541
10542static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010543unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010544{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010545 int kind;
10546 void *data;
10547 Py_ssize_t len, i;
10548
10549 if (PyUnicode_READY(self) == -1)
10550 return NULL;
10551
10552 kind = PyUnicode_KIND(self);
10553 data = PyUnicode_DATA(self);
10554 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010555
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010556 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010557 if (len == 1) {
10558 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10559 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10560 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010561
10562 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010563 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010564 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010565
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010566 for (i = 0; i < len; i++) {
10567 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010568 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010569 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010570 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010571 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010572}
10573
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010574PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010575 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010576\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010577Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010578False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010579
10580static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010581unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010582{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010583 Py_ssize_t i, length;
10584 int kind;
10585 void *data;
10586
10587 if (PyUnicode_READY(self) == -1)
10588 return NULL;
10589 length = PyUnicode_GET_LENGTH(self);
10590 kind = PyUnicode_KIND(self);
10591 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010592
Guido van Rossumd57fd912000-03-10 22:53:23 +000010593 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010594 if (length == 1)
10595 return PyBool_FromLong(
10596 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010597
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010598 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010599 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010600 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010601
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010602 for (i = 0; i < length; i++) {
10603 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010604 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010605 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010606 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010607}
10608
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010609PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010610 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010611\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010612Return True if all characters in S are digits\n\
10613and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010614
10615static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010616unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010617{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010618 Py_ssize_t i, length;
10619 int kind;
10620 void *data;
10621
10622 if (PyUnicode_READY(self) == -1)
10623 return NULL;
10624 length = PyUnicode_GET_LENGTH(self);
10625 kind = PyUnicode_KIND(self);
10626 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010627
Guido van Rossumd57fd912000-03-10 22:53:23 +000010628 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010629 if (length == 1) {
10630 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10631 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
10632 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010633
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010634 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010635 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010636 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010637
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010638 for (i = 0; i < length; i++) {
10639 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010640 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010641 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010642 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010643}
10644
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010645PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010646 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010647\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010648Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010649False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010650
10651static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010652unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010653{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010654 Py_ssize_t i, length;
10655 int kind;
10656 void *data;
10657
10658 if (PyUnicode_READY(self) == -1)
10659 return NULL;
10660 length = PyUnicode_GET_LENGTH(self);
10661 kind = PyUnicode_KIND(self);
10662 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010663
Guido van Rossumd57fd912000-03-10 22:53:23 +000010664 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010665 if (length == 1)
10666 return PyBool_FromLong(
10667 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010668
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010669 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010670 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010671 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010672
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010673 for (i = 0; i < length; i++) {
10674 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010675 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010676 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010677 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010678}
10679
Martin v. Löwis47383402007-08-15 07:32:56 +000010680int
10681PyUnicode_IsIdentifier(PyObject *self)
10682{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010683 int kind;
10684 void *data;
10685 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010686 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000010687
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010688 if (PyUnicode_READY(self) == -1) {
10689 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000010690 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010691 }
10692
10693 /* Special case for empty strings */
10694 if (PyUnicode_GET_LENGTH(self) == 0)
10695 return 0;
10696 kind = PyUnicode_KIND(self);
10697 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000010698
10699 /* PEP 3131 says that the first character must be in
10700 XID_Start and subsequent characters in XID_Continue,
10701 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000010702 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000010703 letters, digits, underscore). However, given the current
10704 definition of XID_Start and XID_Continue, it is sufficient
10705 to check just for these, except that _ must be allowed
10706 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010707 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050010708 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000010709 return 0;
10710
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040010711 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010712 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010713 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000010714 return 1;
10715}
10716
10717PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010718 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000010719\n\
10720Return True if S is a valid identifier according\n\
10721to the language definition.");
10722
10723static PyObject*
10724unicode_isidentifier(PyObject *self)
10725{
10726 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
10727}
10728
Georg Brandl559e5d72008-06-11 18:37:52 +000010729PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010730 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000010731\n\
10732Return True if all characters in S are considered\n\
10733printable in repr() or S is empty, False otherwise.");
10734
10735static PyObject*
10736unicode_isprintable(PyObject *self)
10737{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010738 Py_ssize_t i, length;
10739 int kind;
10740 void *data;
10741
10742 if (PyUnicode_READY(self) == -1)
10743 return NULL;
10744 length = PyUnicode_GET_LENGTH(self);
10745 kind = PyUnicode_KIND(self);
10746 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000010747
10748 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010749 if (length == 1)
10750 return PyBool_FromLong(
10751 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000010752
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010753 for (i = 0; i < length; i++) {
10754 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010755 Py_RETURN_FALSE;
10756 }
10757 }
10758 Py_RETURN_TRUE;
10759}
10760
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010761PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000010762 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010763\n\
10764Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000010765iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010766
10767static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010768unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010769{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010770 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010771}
10772
Martin v. Löwis18e16552006-02-15 17:27:45 +000010773static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000010774unicode_length(PyUnicodeObject *self)
10775{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010776 if (PyUnicode_READY(self) == -1)
10777 return -1;
10778 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010779}
10780
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010781PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010782 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010783\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010784Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010785done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010786
10787static PyObject *
10788unicode_ljust(PyUnicodeObject *self, PyObject *args)
10789{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010790 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010791 Py_UCS4 fillchar = ' ';
10792
10793 if (PyUnicode_READY(self) == -1)
10794 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010795
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010796 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010797 return NULL;
10798
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010799 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010800 Py_INCREF(self);
10801 return (PyObject*) self;
10802 }
10803
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010804 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010805}
10806
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010807PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010808 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010809\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010810Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010811
10812static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010813unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010814{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010815 return fixup(self, fixlower);
10816}
10817
/* Strip modes; the values double as indices into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by strip mode. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for a strip mode: the format string minus its "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i]+3)
10826
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010827/* externally visible for str.strip(unicode) */
10828PyObject *
10829_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
10830{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010831 void *data;
10832 int kind;
10833 Py_ssize_t i, j, len;
10834 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010835
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010836 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
10837 return NULL;
10838
10839 kind = PyUnicode_KIND(self);
10840 data = PyUnicode_DATA(self);
10841 len = PyUnicode_GET_LENGTH(self);
10842 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
10843 PyUnicode_DATA(sepobj),
10844 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010845
Benjamin Peterson14339b62009-01-31 16:36:08 +000010846 i = 0;
10847 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010848 while (i < len &&
10849 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010850 i++;
10851 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010852 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010853
Benjamin Peterson14339b62009-01-31 16:36:08 +000010854 j = len;
10855 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010856 do {
10857 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010858 } while (j >= i &&
10859 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000010860 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010861 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010862
Victor Stinner12bab6d2011-10-01 01:53:49 +020010863 return PyUnicode_Substring((PyObject*)self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010864}
10865
10866PyObject*
10867PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
10868{
10869 unsigned char *data;
10870 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020010871 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010872
Victor Stinnerde636f32011-10-01 03:55:54 +020010873 if (PyUnicode_READY(self) == -1)
10874 return NULL;
10875
10876 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
10877
Victor Stinner12bab6d2011-10-01 01:53:49 +020010878 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010879 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020010880 if (PyUnicode_CheckExact(self)) {
10881 Py_INCREF(self);
10882 return self;
10883 }
10884 else
10885 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010886 }
10887
Victor Stinner12bab6d2011-10-01 01:53:49 +020010888 length = end - start;
10889 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010890 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010891
Victor Stinnerde636f32011-10-01 03:55:54 +020010892 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020010893 PyErr_SetString(PyExc_IndexError, "string index out of range");
10894 return NULL;
10895 }
10896
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010897 kind = PyUnicode_KIND(self);
10898 data = PyUnicode_1BYTE_DATA(self);
Victor Stinner034f6cf2011-09-30 02:26:44 +020010899 return PyUnicode_FromKindAndData(kind,
10900 data + PyUnicode_KIND_SIZE(kind, start),
Victor Stinner12bab6d2011-10-01 01:53:49 +020010901 length);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010902}
Guido van Rossumd57fd912000-03-10 22:53:23 +000010903
10904static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010905do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010906{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010907 int kind;
10908 void *data;
10909 Py_ssize_t len, i, j;
10910
10911 if (PyUnicode_READY(self) == -1)
10912 return NULL;
10913
10914 kind = PyUnicode_KIND(self);
10915 data = PyUnicode_DATA(self);
10916 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010917
Benjamin Peterson14339b62009-01-31 16:36:08 +000010918 i = 0;
10919 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010920 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010921 i++;
10922 }
10923 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010924
Benjamin Peterson14339b62009-01-31 16:36:08 +000010925 j = len;
10926 if (striptype != LEFTSTRIP) {
10927 do {
10928 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010929 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010930 j++;
10931 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010932
Victor Stinner12bab6d2011-10-01 01:53:49 +020010933 return PyUnicode_Substring((PyObject*)self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010934}
10935
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010936
10937static PyObject *
10938do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
10939{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010940 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010941
Benjamin Peterson14339b62009-01-31 16:36:08 +000010942 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
10943 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010944
Benjamin Peterson14339b62009-01-31 16:36:08 +000010945 if (sep != NULL && sep != Py_None) {
10946 if (PyUnicode_Check(sep))
10947 return _PyUnicode_XStrip(self, striptype, sep);
10948 else {
10949 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010950 "%s arg must be None or str",
10951 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010952 return NULL;
10953 }
10954 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010955
Benjamin Peterson14339b62009-01-31 16:36:08 +000010956 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010957}
10958
10959
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010960PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010961 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010962\n\
10963Return a copy of the string S with leading and trailing\n\
10964whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010965If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010966
10967static PyObject *
10968unicode_strip(PyUnicodeObject *self, PyObject *args)
10969{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010970 if (PyTuple_GET_SIZE(args) == 0)
10971 return do_strip(self, BOTHSTRIP); /* Common case */
10972 else
10973 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010974}
10975
10976
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010977PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010978 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010979\n\
10980Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010981If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010982
10983static PyObject *
10984unicode_lstrip(PyUnicodeObject *self, PyObject *args)
10985{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010986 if (PyTuple_GET_SIZE(args) == 0)
10987 return do_strip(self, LEFTSTRIP); /* Common case */
10988 else
10989 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010990}
10991
10992
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010993PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010994 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010995\n\
10996Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010997If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010998
10999static PyObject *
11000unicode_rstrip(PyUnicodeObject *self, PyObject *args)
11001{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011002 if (PyTuple_GET_SIZE(args) == 0)
11003 return do_strip(self, RIGHTSTRIP); /* Common case */
11004 else
11005 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011006}
11007
11008
Guido van Rossumd57fd912000-03-10 22:53:23 +000011009static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000011010unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011011{
11012 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011013 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011014
Georg Brandl222de0f2009-04-12 12:01:50 +000011015 if (len < 1) {
11016 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011017 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011018 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011019
Tim Peters7a29bd52001-09-12 03:03:31 +000011020 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011021 /* no repeat, return original string */
11022 Py_INCREF(str);
11023 return (PyObject*) str;
11024 }
Tim Peters8f422462000-09-09 06:13:41 +000011025
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011026 if (PyUnicode_READY(str) == -1)
11027 return NULL;
11028
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011029 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011030 PyErr_SetString(PyExc_OverflowError,
11031 "repeated string is too long");
11032 return NULL;
11033 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011034 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011035
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011036 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011037 if (!u)
11038 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011039 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011040
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011041 if (PyUnicode_GET_LENGTH(str) == 1) {
11042 const int kind = PyUnicode_KIND(str);
11043 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11044 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011045 if (kind == PyUnicode_1BYTE_KIND)
11046 memset(to, (unsigned char)fill_char, len);
11047 else {
11048 for (n = 0; n < len; ++n)
11049 PyUnicode_WRITE(kind, to, n, fill_char);
11050 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011051 }
11052 else {
11053 /* number of characters copied this far */
11054 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
11055 const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str);
11056 char *to = (char *) PyUnicode_DATA(u);
11057 Py_MEMCPY(to, PyUnicode_DATA(str),
11058 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011059 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011060 n = (done <= nchars-done) ? done : nchars-done;
11061 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011062 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011063 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011064 }
11065
11066 return (PyObject*) u;
11067}
11068
Alexander Belopolsky40018472011-02-26 01:02:56 +000011069PyObject *
11070PyUnicode_Replace(PyObject *obj,
11071 PyObject *subobj,
11072 PyObject *replobj,
11073 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011074{
11075 PyObject *self;
11076 PyObject *str1;
11077 PyObject *str2;
11078 PyObject *result;
11079
11080 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011081 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011082 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011083 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011084 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011085 Py_DECREF(self);
11086 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011087 }
11088 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011089 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011090 Py_DECREF(self);
11091 Py_DECREF(str1);
11092 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011093 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011094 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011095 Py_DECREF(self);
11096 Py_DECREF(str1);
11097 Py_DECREF(str2);
11098 return result;
11099}
11100
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011101PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011102 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011103\n\
11104Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011105old replaced by new. If the optional argument count is\n\
11106given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011107
11108static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011109unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011110{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011111 PyObject *str1;
11112 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011113 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011114 PyObject *result;
11115
Martin v. Löwis18e16552006-02-15 17:27:45 +000011116 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011117 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011118 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011119 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011120 str1 = PyUnicode_FromObject(str1);
11121 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11122 return NULL;
11123 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011124 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011125 Py_DECREF(str1);
11126 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011127 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011128
11129 result = replace(self, str1, str2, maxcount);
11130
11131 Py_DECREF(str1);
11132 Py_DECREF(str2);
11133 return result;
11134}
11135
Alexander Belopolsky40018472011-02-26 01:02:56 +000011136static PyObject *
11137unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011138{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011139 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011140 Py_ssize_t isize;
11141 Py_ssize_t osize, squote, dquote, i, o;
11142 Py_UCS4 max, quote;
11143 int ikind, okind;
11144 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011145
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011146 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011147 return NULL;
11148
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011149 isize = PyUnicode_GET_LENGTH(unicode);
11150 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011151
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011152 /* Compute length of output, quote characters, and
11153 maximum character */
11154 osize = 2; /* quotes */
11155 max = 127;
11156 squote = dquote = 0;
11157 ikind = PyUnicode_KIND(unicode);
11158 for (i = 0; i < isize; i++) {
11159 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11160 switch (ch) {
11161 case '\'': squote++; osize++; break;
11162 case '"': dquote++; osize++; break;
11163 case '\\': case '\t': case '\r': case '\n':
11164 osize += 2; break;
11165 default:
11166 /* Fast-path ASCII */
11167 if (ch < ' ' || ch == 0x7f)
11168 osize += 4; /* \xHH */
11169 else if (ch < 0x7f)
11170 osize++;
11171 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11172 osize++;
11173 max = ch > max ? ch : max;
11174 }
11175 else if (ch < 0x100)
11176 osize += 4; /* \xHH */
11177 else if (ch < 0x10000)
11178 osize += 6; /* \uHHHH */
11179 else
11180 osize += 10; /* \uHHHHHHHH */
11181 }
11182 }
11183
11184 quote = '\'';
11185 if (squote) {
11186 if (dquote)
11187 /* Both squote and dquote present. Use squote,
11188 and escape them */
11189 osize += squote;
11190 else
11191 quote = '"';
11192 }
11193
11194 repr = PyUnicode_New(osize, max);
11195 if (repr == NULL)
11196 return NULL;
11197 okind = PyUnicode_KIND(repr);
11198 odata = PyUnicode_DATA(repr);
11199
11200 PyUnicode_WRITE(okind, odata, 0, quote);
11201 PyUnicode_WRITE(okind, odata, osize-1, quote);
11202
11203 for (i = 0, o = 1; i < isize; i++) {
11204 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011205
11206 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011207 if ((ch == quote) || (ch == '\\')) {
11208 PyUnicode_WRITE(okind, odata, o++, '\\');
11209 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011210 continue;
11211 }
11212
Benjamin Peterson29060642009-01-31 22:14:21 +000011213 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011214 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011215 PyUnicode_WRITE(okind, odata, o++, '\\');
11216 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011217 }
11218 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011219 PyUnicode_WRITE(okind, odata, o++, '\\');
11220 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011221 }
11222 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011223 PyUnicode_WRITE(okind, odata, o++, '\\');
11224 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011225 }
11226
11227 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011228 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011229 PyUnicode_WRITE(okind, odata, o++, '\\');
11230 PyUnicode_WRITE(okind, odata, o++, 'x');
11231 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11232 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011233 }
11234
Georg Brandl559e5d72008-06-11 18:37:52 +000011235 /* Copy ASCII characters as-is */
11236 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011237 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011238 }
11239
Benjamin Peterson29060642009-01-31 22:14:21 +000011240 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000011241 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011242 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000011243 (categories Z* and C* except ASCII space)
11244 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011245 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011246 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011247 if (ch <= 0xff) {
11248 PyUnicode_WRITE(okind, odata, o++, '\\');
11249 PyUnicode_WRITE(okind, odata, o++, 'x');
11250 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11251 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011252 }
11253 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011254 else if (ch >= 0x10000) {
11255 PyUnicode_WRITE(okind, odata, o++, '\\');
11256 PyUnicode_WRITE(okind, odata, o++, 'U');
11257 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
11258 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
11259 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
11260 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
11261 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11262 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11263 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11264 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011265 }
11266 /* Map 16-bit characters to '\uxxxx' */
11267 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011268 PyUnicode_WRITE(okind, odata, o++, '\\');
11269 PyUnicode_WRITE(okind, odata, o++, 'u');
11270 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11271 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11272 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11273 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011274 }
11275 }
11276 /* Copy characters as-is */
11277 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011278 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011279 }
11280 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000011281 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011282 /* Closing quote already added at the beginning */
Walter Dörwald79e913e2007-05-12 11:08:06 +000011283 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011284}
11285
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011286PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011287 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011288\n\
11289Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011290such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011291arguments start and end are interpreted as in slice notation.\n\
11292\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011293Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011294
11295static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011296unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011297{
Jesus Ceaac451502011-04-20 17:09:23 +020011298 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011299 Py_ssize_t start;
11300 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011301 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011302
Jesus Ceaac451502011-04-20 17:09:23 +020011303 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
11304 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011305 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011306
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011307 if (PyUnicode_READY(self) == -1)
11308 return NULL;
11309 if (PyUnicode_READY(substring) == -1)
11310 return NULL;
11311
11312 result = any_find_slice(
11313 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11314 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011315 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011316
11317 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011318
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011319 if (result == -2)
11320 return NULL;
11321
Christian Heimes217cfd12007-12-02 14:31:20 +000011322 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011323}
11324
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011325PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011326 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011327\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011328Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011329
11330static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011331unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011332{
Jesus Ceaac451502011-04-20 17:09:23 +020011333 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011334 Py_ssize_t start;
11335 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011336 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011337
Jesus Ceaac451502011-04-20 17:09:23 +020011338 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
11339 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011340 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011341
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011342 if (PyUnicode_READY(self) == -1)
11343 return NULL;
11344 if (PyUnicode_READY(substring) == -1)
11345 return NULL;
11346
11347 result = any_find_slice(
11348 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11349 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011350 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011351
11352 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011353
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011354 if (result == -2)
11355 return NULL;
11356
Guido van Rossumd57fd912000-03-10 22:53:23 +000011357 if (result < 0) {
11358 PyErr_SetString(PyExc_ValueError, "substring not found");
11359 return NULL;
11360 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011361
Christian Heimes217cfd12007-12-02 14:31:20 +000011362 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011363}
11364
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011365PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011366 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011367\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011368Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011369done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011370
11371static PyObject *
11372unicode_rjust(PyUnicodeObject *self, PyObject *args)
11373{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011374 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011375 Py_UCS4 fillchar = ' ';
11376
Victor Stinnere9a29352011-10-01 02:14:59 +020011377 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011378 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011379
Victor Stinnere9a29352011-10-01 02:14:59 +020011380 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011381 return NULL;
11382
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011383 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011384 Py_INCREF(self);
11385 return (PyObject*) self;
11386 }
11387
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011388 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011389}
11390
Alexander Belopolsky40018472011-02-26 01:02:56 +000011391PyObject *
11392PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011393{
11394 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000011395
Guido van Rossumd57fd912000-03-10 22:53:23 +000011396 s = PyUnicode_FromObject(s);
11397 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011398 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011399 if (sep != NULL) {
11400 sep = PyUnicode_FromObject(sep);
11401 if (sep == NULL) {
11402 Py_DECREF(s);
11403 return NULL;
11404 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011405 }
11406
11407 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11408
11409 Py_DECREF(s);
11410 Py_XDECREF(sep);
11411 return result;
11412}
11413
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011414PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011415 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011416\n\
11417Return a list of the words in S, using sep as the\n\
11418delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000011419splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000011420whitespace string is a separator and empty strings are\n\
11421removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011422
11423static PyObject*
11424unicode_split(PyUnicodeObject *self, PyObject *args)
11425{
11426 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011427 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011428
Martin v. Löwis18e16552006-02-15 17:27:45 +000011429 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011430 return NULL;
11431
11432 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011433 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011434 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011435 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011436 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011437 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011438}
11439
Thomas Wouters477c8d52006-05-27 19:21:47 +000011440PyObject *
11441PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11442{
11443 PyObject* str_obj;
11444 PyObject* sep_obj;
11445 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011446 int kind1, kind2, kind;
11447 void *buf1 = NULL, *buf2 = NULL;
11448 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011449
11450 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020011451 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011452 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011453 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011454 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011455 Py_DECREF(str_obj);
11456 return NULL;
11457 }
11458
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011459 kind1 = PyUnicode_KIND(str_in);
11460 kind2 = PyUnicode_KIND(sep_obj);
11461 kind = kind1 > kind2 ? kind1 : kind2;
11462 buf1 = PyUnicode_DATA(str_in);
11463 if (kind1 != kind)
11464 buf1 = _PyUnicode_AsKind(str_in, kind);
11465 if (!buf1)
11466 goto onError;
11467 buf2 = PyUnicode_DATA(sep_obj);
11468 if (kind2 != kind)
11469 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11470 if (!buf2)
11471 goto onError;
11472 len1 = PyUnicode_GET_LENGTH(str_obj);
11473 len2 = PyUnicode_GET_LENGTH(sep_obj);
11474
11475 switch(PyUnicode_KIND(str_in)) {
11476 case PyUnicode_1BYTE_KIND:
11477 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11478 break;
11479 case PyUnicode_2BYTE_KIND:
11480 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11481 break;
11482 case PyUnicode_4BYTE_KIND:
11483 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11484 break;
11485 default:
11486 assert(0);
11487 out = 0;
11488 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011489
11490 Py_DECREF(sep_obj);
11491 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011492 if (kind1 != kind)
11493 PyMem_Free(buf1);
11494 if (kind2 != kind)
11495 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011496
11497 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011498 onError:
11499 Py_DECREF(sep_obj);
11500 Py_DECREF(str_obj);
11501 if (kind1 != kind && buf1)
11502 PyMem_Free(buf1);
11503 if (kind2 != kind && buf2)
11504 PyMem_Free(buf2);
11505 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011506}
11507
11508
11509PyObject *
11510PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11511{
11512 PyObject* str_obj;
11513 PyObject* sep_obj;
11514 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011515 int kind1, kind2, kind;
11516 void *buf1 = NULL, *buf2 = NULL;
11517 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011518
11519 str_obj = PyUnicode_FromObject(str_in);
11520 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000011521 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011522 sep_obj = PyUnicode_FromObject(sep_in);
11523 if (!sep_obj) {
11524 Py_DECREF(str_obj);
11525 return NULL;
11526 }
11527
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011528 kind1 = PyUnicode_KIND(str_in);
11529 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020011530 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011531 buf1 = PyUnicode_DATA(str_in);
11532 if (kind1 != kind)
11533 buf1 = _PyUnicode_AsKind(str_in, kind);
11534 if (!buf1)
11535 goto onError;
11536 buf2 = PyUnicode_DATA(sep_obj);
11537 if (kind2 != kind)
11538 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11539 if (!buf2)
11540 goto onError;
11541 len1 = PyUnicode_GET_LENGTH(str_obj);
11542 len2 = PyUnicode_GET_LENGTH(sep_obj);
11543
11544 switch(PyUnicode_KIND(str_in)) {
11545 case PyUnicode_1BYTE_KIND:
11546 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11547 break;
11548 case PyUnicode_2BYTE_KIND:
11549 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11550 break;
11551 case PyUnicode_4BYTE_KIND:
11552 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11553 break;
11554 default:
11555 assert(0);
11556 out = 0;
11557 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011558
11559 Py_DECREF(sep_obj);
11560 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011561 if (kind1 != kind)
11562 PyMem_Free(buf1);
11563 if (kind2 != kind)
11564 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011565
11566 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011567 onError:
11568 Py_DECREF(sep_obj);
11569 Py_DECREF(str_obj);
11570 if (kind1 != kind && buf1)
11571 PyMem_Free(buf1);
11572 if (kind2 != kind && buf2)
11573 PyMem_Free(buf2);
11574 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011575}
11576
11577PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011578 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011579\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011580Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011581the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011582found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011583
11584static PyObject*
11585unicode_partition(PyUnicodeObject *self, PyObject *separator)
11586{
11587 return PyUnicode_Partition((PyObject *)self, separator);
11588}
11589
11590PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000011591 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011592\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011593Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011594the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011595separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011596
11597static PyObject*
11598unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
11599{
11600 return PyUnicode_RPartition((PyObject *)self, separator);
11601}
11602
Alexander Belopolsky40018472011-02-26 01:02:56 +000011603PyObject *
11604PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011605{
11606 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011607
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011608 s = PyUnicode_FromObject(s);
11609 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011610 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011611 if (sep != NULL) {
11612 sep = PyUnicode_FromObject(sep);
11613 if (sep == NULL) {
11614 Py_DECREF(s);
11615 return NULL;
11616 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011617 }
11618
11619 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11620
11621 Py_DECREF(s);
11622 Py_XDECREF(sep);
11623 return result;
11624}
11625
11626PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011627 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011628\n\
11629Return a list of the words in S, using sep as the\n\
11630delimiter string, starting at the end of the string and\n\
11631working to the front. If maxsplit is given, at most maxsplit\n\
11632splits are done. If sep is not specified, any whitespace string\n\
11633is a separator.");
11634
11635static PyObject*
11636unicode_rsplit(PyUnicodeObject *self, PyObject *args)
11637{
11638 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011639 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011640
Martin v. Löwis18e16552006-02-15 17:27:45 +000011641 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011642 return NULL;
11643
11644 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011645 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011646 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011647 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011648 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011649 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011650}
11651
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011652PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011653 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011654\n\
11655Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000011656Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011657is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011658
11659static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011660unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011661{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011662 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000011663 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011664
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011665 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
11666 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011667 return NULL;
11668
Guido van Rossum86662912000-04-11 15:38:46 +000011669 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011670}
11671
11672static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000011673PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011674{
Walter Dörwald346737f2007-05-31 10:44:43 +000011675 if (PyUnicode_CheckExact(self)) {
11676 Py_INCREF(self);
11677 return self;
11678 } else
11679 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020011680 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011681}
11682
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011683PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011684 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011685\n\
11686Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011687and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011688
11689static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011690unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011691{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011692 return fixup(self, fixswapcase);
11693}
11694
Georg Brandlceee0772007-11-27 23:48:05 +000011695PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011696 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011697\n\
11698Return a translation table usable for str.translate().\n\
11699If there is only one argument, it must be a dictionary mapping Unicode\n\
11700ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011701Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011702If there are two arguments, they must be strings of equal length, and\n\
11703in the resulting dictionary, each character in x will be mapped to the\n\
11704character at the same position in y. If there is a third argument, it\n\
11705must be a string, whose characters will be mapped to None in the result.");
11706
11707static PyObject*
11708unicode_maketrans(PyUnicodeObject *null, PyObject *args)
11709{
11710 PyObject *x, *y = NULL, *z = NULL;
11711 PyObject *new = NULL, *key, *value;
11712 Py_ssize_t i = 0;
11713 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011714
Georg Brandlceee0772007-11-27 23:48:05 +000011715 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
11716 return NULL;
11717 new = PyDict_New();
11718 if (!new)
11719 return NULL;
11720 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011721 int x_kind, y_kind, z_kind;
11722 void *x_data, *y_data, *z_data;
11723
Georg Brandlceee0772007-11-27 23:48:05 +000011724 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000011725 if (!PyUnicode_Check(x)) {
11726 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
11727 "be a string if there is a second argument");
11728 goto err;
11729 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011730 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011731 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
11732 "arguments must have equal length");
11733 goto err;
11734 }
11735 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011736 x_kind = PyUnicode_KIND(x);
11737 y_kind = PyUnicode_KIND(y);
11738 x_data = PyUnicode_DATA(x);
11739 y_data = PyUnicode_DATA(y);
11740 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
11741 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
11742 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011743 if (!key || !value)
11744 goto err;
11745 res = PyDict_SetItem(new, key, value);
11746 Py_DECREF(key);
11747 Py_DECREF(value);
11748 if (res < 0)
11749 goto err;
11750 }
11751 /* create entries for deleting chars in z */
11752 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011753 z_kind = PyUnicode_KIND(z);
11754 z_data = PyUnicode_DATA(z);
Georg Brandlceee0772007-11-27 23:48:05 +000011755 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011756 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011757 if (!key)
11758 goto err;
11759 res = PyDict_SetItem(new, key, Py_None);
11760 Py_DECREF(key);
11761 if (res < 0)
11762 goto err;
11763 }
11764 }
11765 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011766 int kind;
11767 void *data;
11768
Georg Brandlceee0772007-11-27 23:48:05 +000011769 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000011770 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011771 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
11772 "to maketrans it must be a dict");
11773 goto err;
11774 }
11775 /* copy entries into the new dict, converting string keys to int keys */
11776 while (PyDict_Next(x, &i, &key, &value)) {
11777 if (PyUnicode_Check(key)) {
11778 /* convert string keys to integer keys */
11779 PyObject *newkey;
11780 if (PyUnicode_GET_SIZE(key) != 1) {
11781 PyErr_SetString(PyExc_ValueError, "string keys in translate "
11782 "table must be of length 1");
11783 goto err;
11784 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011785 kind = PyUnicode_KIND(key);
11786 data = PyUnicode_DATA(key);
11787 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000011788 if (!newkey)
11789 goto err;
11790 res = PyDict_SetItem(new, newkey, value);
11791 Py_DECREF(newkey);
11792 if (res < 0)
11793 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000011794 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011795 /* just keep integer keys */
11796 if (PyDict_SetItem(new, key, value) < 0)
11797 goto err;
11798 } else {
11799 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
11800 "be strings or integers");
11801 goto err;
11802 }
11803 }
11804 }
11805 return new;
11806 err:
11807 Py_DECREF(new);
11808 return NULL;
11809}
11810
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011811PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011812 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011813\n\
11814Return a copy of the string S, where all characters have been mapped\n\
11815through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011816Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000011817Unmapped characters are left untouched. Characters mapped to None\n\
11818are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011819
11820static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011821unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011822{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011823 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011824}
11825
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011826PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011827 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011828\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011829Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011830
11831static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011832unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011833{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011834 return fixup(self, fixupper);
11835}
11836
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011837PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011838 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011839\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000011840Pad a numeric string S with zeros on the left, to fill a field\n\
11841of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011842
11843static PyObject *
11844unicode_zfill(PyUnicodeObject *self, PyObject *args)
11845{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011846 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011847 PyUnicodeObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011848 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011849 int kind;
11850 void *data;
11851 Py_UCS4 chr;
11852
11853 if (PyUnicode_READY(self) == -1)
11854 return NULL;
11855
Martin v. Löwis18e16552006-02-15 17:27:45 +000011856 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011857 return NULL;
11858
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011859 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000011860 if (PyUnicode_CheckExact(self)) {
11861 Py_INCREF(self);
11862 return (PyObject*) self;
11863 }
11864 else
Victor Stinner2219e0a2011-10-01 01:16:59 +020011865 return PyUnicode_Copy((PyObject*)self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011866 }
11867
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011868 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011869
11870 u = pad(self, fill, 0, '0');
11871
Walter Dörwald068325e2002-04-15 13:36:47 +000011872 if (u == NULL)
11873 return NULL;
11874
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011875 kind = PyUnicode_KIND(u);
11876 data = PyUnicode_DATA(u);
11877 chr = PyUnicode_READ(kind, data, fill);
11878
11879 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011880 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011881 PyUnicode_WRITE(kind, data, 0, chr);
11882 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000011883 }
11884
11885 return (PyObject*) u;
11886}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011887
#if 0
/* Compiled out.  Thin wrapper that returns the result of
   PyUnicode_TransformDecimalAndSpaceToASCII() applied to self. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
11895
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011896PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011897 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011898\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011899Return True if S starts with the specified prefix, False otherwise.\n\
11900With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011901With optional end, stop comparing S at that position.\n\
11902prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011903
11904static PyObject *
11905unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011906 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011907{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011908 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011909 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011910 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011911 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011912 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011913
Jesus Ceaac451502011-04-20 17:09:23 +020011914 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011915 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011916 if (PyTuple_Check(subobj)) {
11917 Py_ssize_t i;
11918 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11919 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011920 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011921 if (substring == NULL)
11922 return NULL;
11923 result = tailmatch(self, substring, start, end, -1);
11924 Py_DECREF(substring);
11925 if (result) {
11926 Py_RETURN_TRUE;
11927 }
11928 }
11929 /* nothing matched */
11930 Py_RETURN_FALSE;
11931 }
11932 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011933 if (substring == NULL) {
11934 if (PyErr_ExceptionMatches(PyExc_TypeError))
11935 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
11936 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011937 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011938 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011939 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011940 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011941 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011942}
11943
11944
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011945PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011946 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011947\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011948Return True if S ends with the specified suffix, False otherwise.\n\
11949With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011950With optional end, stop comparing S at that position.\n\
11951suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011952
11953static PyObject *
11954unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011955 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011956{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011957 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011958 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011959 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011960 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011961 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011962
Jesus Ceaac451502011-04-20 17:09:23 +020011963 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011964 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011965 if (PyTuple_Check(subobj)) {
11966 Py_ssize_t i;
11967 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11968 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011969 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011970 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011971 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011972 result = tailmatch(self, substring, start, end, +1);
11973 Py_DECREF(substring);
11974 if (result) {
11975 Py_RETURN_TRUE;
11976 }
11977 }
11978 Py_RETURN_FALSE;
11979 }
11980 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011981 if (substring == NULL) {
11982 if (PyErr_ExceptionMatches(PyExc_TypeError))
11983 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
11984 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011985 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011986 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011987 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011988 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011989 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011990}
11991
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011992#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000011993
/* Docstrings for str.format() and str.format_map().  They are attached
   to do_string_format and do_string_format_map in unicode_methods[]. */
PyDoc_STRVAR(format__doc__,
             "S.format(*args, **kwargs) -> str\n\
\n\
Return a formatted version of S, using substitutions from args and kwargs.\n\
The substitutions are identified by braces ('{' and '}').");

PyDoc_STRVAR(format_map__doc__,
             "S.format_map(mapping) -> str\n\
\n\
Return a formatted version of S, using substitutions from mapping.\n\
The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012005
Eric Smith4a7d76d2008-05-30 18:10:19 +000012006static PyObject *
12007unicode__format__(PyObject* self, PyObject* args)
12008{
12009 PyObject *format_spec;
12010
12011 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12012 return NULL;
12013
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012014 return _PyUnicode_FormatAdvanced(self, format_spec, 0,
12015 PyUnicode_GET_LENGTH(format_spec));
Eric Smith4a7d76d2008-05-30 18:10:19 +000012016}
12017
Eric Smith8c663262007-08-25 02:26:07 +000012018PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012019 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012020\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012021Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012022
12023static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012024unicode__sizeof__(PyUnicodeObject *v)
12025{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012026 Py_ssize_t size;
12027
12028 /* If it's a compact object, account for base structure +
12029 character data. */
12030 if (PyUnicode_IS_COMPACT_ASCII(v))
12031 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12032 else if (PyUnicode_IS_COMPACT(v))
12033 size = sizeof(PyCompactUnicodeObject) +
12034 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v);
12035 else {
12036 /* If it is a two-block object, account for base object, and
12037 for character block if present. */
12038 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012039 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012040 size += (PyUnicode_GET_LENGTH(v) + 1) *
12041 PyUnicode_CHARACTER_SIZE(v);
12042 }
12043 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012044 with the data pointer. Check if the data is not shared. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012045 if (_PyUnicode_WSTR(v) &&
Victor Stinnera3be6132011-10-03 02:16:37 +020012046 (PyUnicode_DATA(v) != _PyUnicode_WSTR(v)))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012047 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012048 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012049 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012050
12051 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012052}
12053
12054PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012055 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012056
12057static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012058unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012059{
Victor Stinner034f6cf2011-09-30 02:26:44 +020012060 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012061 if (!copy)
12062 return NULL;
12063 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012064}
12065
Guido van Rossumd57fd912000-03-10 22:53:23 +000012066static PyMethodDef unicode_methods[] = {
12067
12068 /* Order is according to common usage: often used methods should
12069 appear first, since lookup is done sequentially. */
12070
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012071 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012072 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12073 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012074 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012075 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12076 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12077 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12078 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12079 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12080 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12081 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012082 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012083 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12084 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12085 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012086 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012087 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12088 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12089 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012090 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012091 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012092 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012093 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012094 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12095 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12096 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12097 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12098 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12099 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12100 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12101 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12102 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12103 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12104 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12105 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12106 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12107 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012108 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012109 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012110 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012111 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012112 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012113 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012114 {"maketrans", (PyCFunction) unicode_maketrans,
12115 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012116 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012117#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012118 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012119#endif
12120
12121#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012122 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012123 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012124#endif
12125
Benjamin Peterson14339b62009-01-31 16:36:08 +000012126 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012127 {NULL, NULL}
12128};
12129
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012130static PyObject *
12131unicode_mod(PyObject *v, PyObject *w)
12132{
Brian Curtindfc80e32011-08-10 20:28:54 -050012133 if (!PyUnicode_Check(v))
12134 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012135 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012136}
12137
12138static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012139 0, /*nb_add*/
12140 0, /*nb_subtract*/
12141 0, /*nb_multiply*/
12142 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012143};
12144
Guido van Rossumd57fd912000-03-10 22:53:23 +000012145static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012146 (lenfunc) unicode_length, /* sq_length */
12147 PyUnicode_Concat, /* sq_concat */
12148 (ssizeargfunc) unicode_repeat, /* sq_repeat */
12149 (ssizeargfunc) unicode_getitem, /* sq_item */
12150 0, /* sq_slice */
12151 0, /* sq_ass_item */
12152 0, /* sq_ass_slice */
12153 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012154};
12155
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012156static PyObject*
12157unicode_subscript(PyUnicodeObject* self, PyObject* item)
12158{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012159 if (PyUnicode_READY(self) == -1)
12160 return NULL;
12161
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000012162 if (PyIndex_Check(item)) {
12163 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012164 if (i == -1 && PyErr_Occurred())
12165 return NULL;
12166 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012167 i += PyUnicode_GET_LENGTH(self);
Victor Stinner2fe5ced2011-10-02 00:25:40 +020012168 return unicode_getitem((PyObject*)self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012169 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000012170 Py_ssize_t start, stop, step, slicelength, cur, i;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012171 const Py_UNICODE* source_buf;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012172 Py_UNICODE* result_buf;
12173 PyObject* result;
12174
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012175 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000012176 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012177 return NULL;
12178 }
12179
12180 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012181 return PyUnicode_New(0, 0);
12182 } else if (start == 0 && step == 1 &&
12183 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000012184 PyUnicode_CheckExact(self)) {
12185 Py_INCREF(self);
12186 return (PyObject *)self;
12187 } else if (step == 1) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012188 return PyUnicode_Substring((PyObject*)self,
12189 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012190 } else {
12191 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +000012192 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
12193 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012194
Benjamin Peterson29060642009-01-31 22:14:21 +000012195 if (result_buf == NULL)
12196 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012197
12198 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
12199 result_buf[i] = source_buf[cur];
12200 }
Tim Petersced69f82003-09-16 20:30:58 +000012201
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012202 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +000012203 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012204 return result;
12205 }
12206 } else {
12207 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
12208 return NULL;
12209 }
12210}
12211
12212static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012213 (lenfunc)unicode_length, /* mp_length */
12214 (binaryfunc)unicode_subscript, /* mp_subscript */
12215 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012216};
12217
Guido van Rossumd57fd912000-03-10 22:53:23 +000012218
Guido van Rossumd57fd912000-03-10 22:53:23 +000012219/* Helpers for PyUnicode_Format() */
12220
12221static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000012222getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012223{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012224 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012225 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012226 (*p_argidx)++;
12227 if (arglen < 0)
12228 return args;
12229 else
12230 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012231 }
12232 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012233 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012234 return NULL;
12235}
12236
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012237/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012238
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012239static PyObject *
12240formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012241{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012242 char *p;
12243 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012244 double x;
Tim Petersced69f82003-09-16 20:30:58 +000012245
Guido van Rossumd57fd912000-03-10 22:53:23 +000012246 x = PyFloat_AsDouble(v);
12247 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012248 return NULL;
12249
Guido van Rossumd57fd912000-03-10 22:53:23 +000012250 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012251 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000012252
Eric Smith0923d1d2009-04-16 20:16:10 +000012253 p = PyOS_double_to_string(x, type, prec,
12254 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012255 if (p == NULL)
12256 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012257 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000012258 PyMem_Free(p);
12259 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012260}
12261
Tim Peters38fd5b62000-09-21 05:43:11 +000012262static PyObject*
12263formatlong(PyObject *val, int flags, int prec, int type)
12264{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012265 char *buf;
12266 int len;
12267 PyObject *str; /* temporary string object. */
12268 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012269
Benjamin Peterson14339b62009-01-31 16:36:08 +000012270 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
12271 if (!str)
12272 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012273 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012274 Py_DECREF(str);
12275 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012276}
12277
Guido van Rossumd57fd912000-03-10 22:53:23 +000012278static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012279formatchar(Py_UCS4 *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012280 size_t buflen,
12281 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012282{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012283 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012284 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012285 if (PyUnicode_GET_LENGTH(v) == 1) {
12286 buf[0] = PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000012287 buf[1] = '\0';
12288 return 1;
12289 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012290 goto onError;
12291 }
12292 else {
12293 /* Integer input truncated to a character */
12294 long x;
12295 x = PyLong_AsLong(v);
12296 if (x == -1 && PyErr_Occurred())
12297 goto onError;
12298
12299 if (x < 0 || x > 0x10ffff) {
12300 PyErr_SetString(PyExc_OverflowError,
12301 "%c arg not in range(0x110000)");
12302 return -1;
12303 }
12304
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012305 buf[0] = (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012306 buf[1] = '\0';
12307 return 1;
12308 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012309
Benjamin Peterson29060642009-01-31 22:14:21 +000012310 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012311 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012312 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012313 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012314}
12315
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012316/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012317 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012318*/
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012319#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012320
Alexander Belopolsky40018472011-02-26 01:02:56 +000012321PyObject *
12322PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012323{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012324 void *fmt;
12325 int fmtkind;
12326 PyObject *result;
12327 Py_UCS4 *res, *res0;
12328 Py_UCS4 max;
12329 int kind;
12330 Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012331 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012332 PyObject *dict = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012333 PyUnicodeObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +000012334
Guido van Rossumd57fd912000-03-10 22:53:23 +000012335 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012336 PyErr_BadInternalCall();
12337 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012338 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012339 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
12340 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012341 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012342 fmt = PyUnicode_DATA(uformat);
12343 fmtkind = PyUnicode_KIND(uformat);
12344 fmtcnt = PyUnicode_GET_LENGTH(uformat);
12345 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012346
12347 reslen = rescnt = fmtcnt + 100;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012348 res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4));
12349 if (res0 == NULL) {
12350 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012351 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012352 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012353
12354 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012355 arglen = PyTuple_Size(args);
12356 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012357 }
12358 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012359 arglen = -1;
12360 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012361 }
Christian Heimes90aa7642007-12-19 02:45:37 +000012362 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000012363 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000012364 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012365
12366 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012367 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012368 if (--rescnt < 0) {
12369 rescnt = fmtcnt + 100;
12370 reslen += rescnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012371 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12372 if (res0 == NULL){
12373 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012374 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012375 }
12376 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012377 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012378 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012379 *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012380 }
12381 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012382 /* Got a format specifier */
12383 int flags = 0;
12384 Py_ssize_t width = -1;
12385 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012386 Py_UCS4 c = '\0';
12387 Py_UCS4 fill;
Benjamin Peterson29060642009-01-31 22:14:21 +000012388 int isnumok;
12389 PyObject *v = NULL;
12390 PyObject *temp = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012391 void *pbuf;
12392 Py_ssize_t pindex;
Benjamin Peterson29060642009-01-31 22:14:21 +000012393 Py_UNICODE sign;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012394 Py_ssize_t len, len1;
12395 Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012396
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012397 fmtpos++;
12398 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
12399 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000012400 Py_ssize_t keylen;
12401 PyObject *key;
12402 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000012403
Benjamin Peterson29060642009-01-31 22:14:21 +000012404 if (dict == NULL) {
12405 PyErr_SetString(PyExc_TypeError,
12406 "format requires a mapping");
12407 goto onError;
12408 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012409 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012410 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012411 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012412 /* Skip over balanced parentheses */
12413 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012414 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000012415 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012416 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000012417 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012418 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000012419 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012420 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012421 if (fmtcnt < 0 || pcount > 0) {
12422 PyErr_SetString(PyExc_ValueError,
12423 "incomplete format key");
12424 goto onError;
12425 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020012426 key = PyUnicode_Substring((PyObject*)uformat,
12427 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000012428 if (key == NULL)
12429 goto onError;
12430 if (args_owned) {
12431 Py_DECREF(args);
12432 args_owned = 0;
12433 }
12434 args = PyObject_GetItem(dict, key);
12435 Py_DECREF(key);
12436 if (args == NULL) {
12437 goto onError;
12438 }
12439 args_owned = 1;
12440 arglen = -1;
12441 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012442 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012443 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012444 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012445 case '-': flags |= F_LJUST; continue;
12446 case '+': flags |= F_SIGN; continue;
12447 case ' ': flags |= F_BLANK; continue;
12448 case '#': flags |= F_ALT; continue;
12449 case '0': flags |= F_ZERO; continue;
12450 }
12451 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012452 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012453 if (c == '*') {
12454 v = getnextarg(args, arglen, &argidx);
12455 if (v == NULL)
12456 goto onError;
12457 if (!PyLong_Check(v)) {
12458 PyErr_SetString(PyExc_TypeError,
12459 "* wants int");
12460 goto onError;
12461 }
12462 width = PyLong_AsLong(v);
12463 if (width == -1 && PyErr_Occurred())
12464 goto onError;
12465 if (width < 0) {
12466 flags |= F_LJUST;
12467 width = -width;
12468 }
12469 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012470 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012471 }
12472 else if (c >= '0' && c <= '9') {
12473 width = c - '0';
12474 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012475 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012476 if (c < '0' || c > '9')
12477 break;
12478 if ((width*10) / 10 != width) {
12479 PyErr_SetString(PyExc_ValueError,
12480 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000012481 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000012482 }
12483 width = width*10 + (c - '0');
12484 }
12485 }
12486 if (c == '.') {
12487 prec = 0;
12488 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012489 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012490 if (c == '*') {
12491 v = getnextarg(args, arglen, &argidx);
12492 if (v == NULL)
12493 goto onError;
12494 if (!PyLong_Check(v)) {
12495 PyErr_SetString(PyExc_TypeError,
12496 "* wants int");
12497 goto onError;
12498 }
12499 prec = PyLong_AsLong(v);
12500 if (prec == -1 && PyErr_Occurred())
12501 goto onError;
12502 if (prec < 0)
12503 prec = 0;
12504 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012505 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012506 }
12507 else if (c >= '0' && c <= '9') {
12508 prec = c - '0';
12509 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012510 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012511 if (c < '0' || c > '9')
12512 break;
12513 if ((prec*10) / 10 != prec) {
12514 PyErr_SetString(PyExc_ValueError,
12515 "prec too big");
12516 goto onError;
12517 }
12518 prec = prec*10 + (c - '0');
12519 }
12520 }
12521 } /* prec */
12522 if (fmtcnt >= 0) {
12523 if (c == 'h' || c == 'l' || c == 'L') {
12524 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012525 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012526 }
12527 }
12528 if (fmtcnt < 0) {
12529 PyErr_SetString(PyExc_ValueError,
12530 "incomplete format");
12531 goto onError;
12532 }
12533 if (c != '%') {
12534 v = getnextarg(args, arglen, &argidx);
12535 if (v == NULL)
12536 goto onError;
12537 }
12538 sign = 0;
12539 fill = ' ';
12540 switch (c) {
12541
12542 case '%':
12543 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012544 kind = PyUnicode_4BYTE_KIND;
Benjamin Peterson29060642009-01-31 22:14:21 +000012545 /* presume that buffer length is at least 1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012546 PyUnicode_WRITE(kind, pbuf, 0, '%');
Benjamin Peterson29060642009-01-31 22:14:21 +000012547 len = 1;
12548 break;
12549
12550 case 's':
12551 case 'r':
12552 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000012553 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012554 temp = v;
12555 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012556 }
12557 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012558 if (c == 's')
12559 temp = PyObject_Str(v);
12560 else if (c == 'r')
12561 temp = PyObject_Repr(v);
12562 else
12563 temp = PyObject_ASCII(v);
12564 if (temp == NULL)
12565 goto onError;
12566 if (PyUnicode_Check(temp))
12567 /* nothing to do */;
12568 else {
12569 Py_DECREF(temp);
12570 PyErr_SetString(PyExc_TypeError,
12571 "%s argument has non-string str()");
12572 goto onError;
12573 }
12574 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012575 if (PyUnicode_READY(temp) == -1) {
12576 Py_CLEAR(temp);
12577 goto onError;
12578 }
12579 pbuf = PyUnicode_DATA(temp);
12580 kind = PyUnicode_KIND(temp);
12581 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012582 if (prec >= 0 && len > prec)
12583 len = prec;
12584 break;
12585
12586 case 'i':
12587 case 'd':
12588 case 'u':
12589 case 'o':
12590 case 'x':
12591 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000012592 isnumok = 0;
12593 if (PyNumber_Check(v)) {
12594 PyObject *iobj=NULL;
12595
12596 if (PyLong_Check(v)) {
12597 iobj = v;
12598 Py_INCREF(iobj);
12599 }
12600 else {
12601 iobj = PyNumber_Long(v);
12602 }
12603 if (iobj!=NULL) {
12604 if (PyLong_Check(iobj)) {
12605 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070012606 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000012607 Py_DECREF(iobj);
12608 if (!temp)
12609 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012610 if (PyUnicode_READY(temp) == -1) {
12611 Py_CLEAR(temp);
12612 goto onError;
12613 }
12614 pbuf = PyUnicode_DATA(temp);
12615 kind = PyUnicode_KIND(temp);
12616 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012617 sign = 1;
12618 }
12619 else {
12620 Py_DECREF(iobj);
12621 }
12622 }
12623 }
12624 if (!isnumok) {
12625 PyErr_Format(PyExc_TypeError,
12626 "%%%c format: a number is required, "
12627 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
12628 goto onError;
12629 }
12630 if (flags & F_ZERO)
12631 fill = '0';
12632 break;
12633
12634 case 'e':
12635 case 'E':
12636 case 'f':
12637 case 'F':
12638 case 'g':
12639 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012640 temp = formatfloat(v, flags, prec, c);
12641 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000012642 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012643 if (PyUnicode_READY(temp) == -1) {
12644 Py_CLEAR(temp);
12645 goto onError;
12646 }
12647 pbuf = PyUnicode_DATA(temp);
12648 kind = PyUnicode_KIND(temp);
12649 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012650 sign = 1;
12651 if (flags & F_ZERO)
12652 fill = '0';
12653 break;
12654
12655 case 'c':
12656 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012657 kind = PyUnicode_4BYTE_KIND;
Victor Stinnerb9dcffb2011-09-29 00:39:24 +020012658 len = formatchar(pbuf, Py_ARRAY_LENGTH(formatbuf), v);
Benjamin Peterson29060642009-01-31 22:14:21 +000012659 if (len < 0)
12660 goto onError;
12661 break;
12662
12663 default:
12664 PyErr_Format(PyExc_ValueError,
12665 "unsupported format character '%c' (0x%x) "
12666 "at index %zd",
12667 (31<=c && c<=126) ? (char)c : '?',
12668 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012669 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000012670 goto onError;
12671 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012672 /* pbuf is initialized here. */
12673 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000012674 if (sign) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012675 if (PyUnicode_READ(kind, pbuf, pindex) == '-' ||
12676 PyUnicode_READ(kind, pbuf, pindex) == '+') {
12677 sign = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012678 len--;
12679 }
12680 else if (flags & F_SIGN)
12681 sign = '+';
12682 else if (flags & F_BLANK)
12683 sign = ' ';
12684 else
12685 sign = 0;
12686 }
12687 if (width < len)
12688 width = len;
12689 if (rescnt - (sign != 0) < width) {
12690 reslen -= rescnt;
12691 rescnt = width + fmtcnt + 100;
12692 reslen += rescnt;
12693 if (reslen < 0) {
12694 Py_XDECREF(temp);
12695 PyErr_NoMemory();
12696 goto onError;
12697 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012698 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12699 if (res0 == 0) {
12700 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012701 Py_XDECREF(temp);
12702 goto onError;
12703 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012704 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012705 }
12706 if (sign) {
12707 if (fill != ' ')
12708 *res++ = sign;
12709 rescnt--;
12710 if (width > len)
12711 width--;
12712 }
12713 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012714 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12715 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000012716 if (fill != ' ') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012717 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12718 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012719 }
12720 rescnt -= 2;
12721 width -= 2;
12722 if (width < 0)
12723 width = 0;
12724 len -= 2;
12725 }
12726 if (width > len && !(flags & F_LJUST)) {
12727 do {
12728 --rescnt;
12729 *res++ = fill;
12730 } while (--width > len);
12731 }
12732 if (fill == ' ') {
12733 if (sign)
12734 *res++ = sign;
12735 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012736 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12737 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
12738 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12739 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012740 }
12741 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012742 /* Copy all characters, preserving len */
12743 len1 = len;
12744 while (len1--) {
12745 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12746 rescnt--;
12747 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012748 while (--width >= len) {
12749 --rescnt;
12750 *res++ = ' ';
12751 }
12752 if (dict && (argidx < arglen) && c != '%') {
12753 PyErr_SetString(PyExc_TypeError,
12754 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +000012755 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012756 goto onError;
12757 }
12758 Py_XDECREF(temp);
12759 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012760 } /* until end */
12761 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012762 PyErr_SetString(PyExc_TypeError,
12763 "not all arguments converted during string formatting");
12764 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012765 }
12766
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012767
12768 for (max=0, res = res0; res < res0+reslen-rescnt; res++)
12769 if (*res > max)
12770 max = *res;
12771 result = PyUnicode_New(reslen - rescnt, max);
12772 if (!result)
Benjamin Peterson29060642009-01-31 22:14:21 +000012773 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012774 kind = PyUnicode_KIND(result);
12775 for (res = res0; res < res0+reslen-rescnt; res++)
12776 PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res);
12777 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012778 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012779 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012780 }
12781 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012782 return (PyObject *)result;
12783
Benjamin Peterson29060642009-01-31 22:14:21 +000012784 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012785 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012786 Py_DECREF(uformat);
12787 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012788 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012789 }
12790 return NULL;
12791}
12792
Jeremy Hylton938ace62002-07-17 16:30:39 +000012793static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000012794unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
12795
Tim Peters6d6c1a32001-08-02 04:15:00 +000012796static PyObject *
12797unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12798{
Benjamin Peterson29060642009-01-31 22:14:21 +000012799 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012800 static char *kwlist[] = {"object", "encoding", "errors", 0};
12801 char *encoding = NULL;
12802 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000012803
Benjamin Peterson14339b62009-01-31 16:36:08 +000012804 if (type != &PyUnicode_Type)
12805 return unicode_subtype_new(type, args, kwds);
12806 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000012807 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012808 return NULL;
12809 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012810 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012811 if (encoding == NULL && errors == NULL)
12812 return PyObject_Str(x);
12813 else
Benjamin Peterson29060642009-01-31 22:14:21 +000012814 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000012815}
12816
Guido van Rossume023fe02001-08-30 03:12:59 +000012817static PyObject *
12818unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12819{
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012820 PyUnicodeObject *unicode, *self;
12821 Py_ssize_t length, char_size;
12822 int share_wstr, share_utf8;
12823 unsigned int kind;
12824 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000012825
Benjamin Peterson14339b62009-01-31 16:36:08 +000012826 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012827
12828 unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
12829 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012830 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020012831 assert(_PyUnicode_CHECK(unicode));
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020012832 if (_PyUnicode_READY_REPLACE(&unicode))
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012833 return NULL;
12834
12835 self = (PyUnicodeObject *) type->tp_alloc(type, 0);
12836 if (self == NULL) {
12837 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012838 return NULL;
12839 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012840 kind = PyUnicode_KIND(unicode);
12841 length = PyUnicode_GET_LENGTH(unicode);
12842
12843 _PyUnicode_LENGTH(self) = length;
12844 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
12845 _PyUnicode_STATE(self).interned = 0;
12846 _PyUnicode_STATE(self).kind = kind;
12847 _PyUnicode_STATE(self).compact = 0;
12848 _PyUnicode_STATE(self).ascii = 0;
12849 _PyUnicode_STATE(self).ready = 1;
12850 _PyUnicode_WSTR(self) = NULL;
12851 _PyUnicode_UTF8_LENGTH(self) = 0;
12852 _PyUnicode_UTF8(self) = NULL;
12853 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020012854 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012855
12856 share_utf8 = 0;
12857 share_wstr = 0;
12858 if (kind == PyUnicode_1BYTE_KIND) {
12859 char_size = 1;
12860 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
12861 share_utf8 = 1;
12862 }
12863 else if (kind == PyUnicode_2BYTE_KIND) {
12864 char_size = 2;
12865 if (sizeof(wchar_t) == 2)
12866 share_wstr = 1;
12867 }
12868 else {
12869 assert(kind == PyUnicode_4BYTE_KIND);
12870 char_size = 4;
12871 if (sizeof(wchar_t) == 4)
12872 share_wstr = 1;
12873 }
12874
12875 /* Ensure we won't overflow the length. */
12876 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
12877 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012878 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012879 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012880 data = PyObject_MALLOC((length + 1) * char_size);
12881 if (data == NULL) {
12882 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012883 goto onError;
12884 }
12885
Victor Stinnerc3c74152011-10-02 20:39:55 +020012886 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012887 if (share_utf8) {
12888 _PyUnicode_UTF8_LENGTH(self) = length;
12889 _PyUnicode_UTF8(self) = data;
12890 }
12891 if (share_wstr) {
12892 _PyUnicode_WSTR_LENGTH(self) = length;
12893 _PyUnicode_WSTR(self) = (wchar_t *)data;
12894 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012895
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012896 Py_MEMCPY(data, PyUnicode_DATA(unicode),
12897 PyUnicode_KIND_SIZE(kind, length + 1));
12898 Py_DECREF(unicode);
12899 return (PyObject *)self;
12900
12901onError:
12902 Py_DECREF(unicode);
12903 Py_DECREF(self);
12904 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000012905}
12906
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012907PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000012908 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000012909\n\
Collin Winterd474ce82007-08-07 19:42:11 +000012910Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000012911encoding defaults to the current default string encoding.\n\
12912errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000012913
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012914static PyObject *unicode_iter(PyObject *seq);
12915
/* Type object for str.  Slots are filled positionally, so the order of the
   entries below must match the field order of PyTypeObject. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                        /* tp_name */
    sizeof(PyUnicodeObject),      /* tp_basicsize */
    0,                            /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,  /* tp_dealloc */
    0,                            /* tp_print */
    0,                            /* tp_getattr */
    0,                            /* tp_setattr */
    0,                            /* tp_reserved */
    unicode_repr,                 /* tp_repr */
    &unicode_as_number,           /* tp_as_number */
    &unicode_as_sequence,         /* tp_as_sequence */
    &unicode_as_mapping,          /* tp_as_mapping */
    (hashfunc) unicode_hash,      /* tp_hash */
    0,                            /* tp_call */
    (reprfunc) unicode_str,       /* tp_str */
    PyObject_GenericGetAttr,      /* tp_getattro */
    0,                            /* tp_setattro */
    0,                            /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,  /* tp_flags */
    unicode_doc,                  /* tp_doc */
    0,                            /* tp_traverse */
    0,                            /* tp_clear */
    PyUnicode_RichCompare,        /* tp_richcompare */
    0,                            /* tp_weaklistoffset */
    unicode_iter,                 /* tp_iter */
    0,                            /* tp_iternext */
    unicode_methods,              /* tp_methods */
    0,                            /* tp_members */
    0,                            /* tp_getset */
    &PyBaseObject_Type,           /* tp_base */
    0,                            /* tp_dict */
    0,                            /* tp_descr_get */
    0,                            /* tp_descr_set */
    0,                            /* tp_dictoffset */
    0,                            /* tp_init */
    0,                            /* tp_alloc */
    unicode_new,                  /* tp_new */
    PyObject_Del,                 /* tp_free */
};
12959
12960/* Initialize the Unicode implementation */
12961
Thomas Wouters78890102000-07-22 19:25:51 +000012962void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012963{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012964 int i;
12965
Thomas Wouters477c8d52006-05-27 19:21:47 +000012966 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012967 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012968 0x000A, /* LINE FEED */
12969 0x000D, /* CARRIAGE RETURN */
12970 0x001C, /* FILE SEPARATOR */
12971 0x001D, /* GROUP SEPARATOR */
12972 0x001E, /* RECORD SEPARATOR */
12973 0x0085, /* NEXT LINE */
12974 0x2028, /* LINE SEPARATOR */
12975 0x2029, /* PARAGRAPH SEPARATOR */
12976 };
12977
Fred Drakee4315f52000-05-09 19:53:39 +000012978 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020012979 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012980 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012981 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012982
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012983 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000012984 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000012985 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012986 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012987
12988 /* initialize the linebreak bloom filter */
12989 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012990 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020012991 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012992
12993 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012994}
12995
12996/* Finalize the Unicode implementation */
12997
/* This implementation has nothing to release here, so the function does no
   work and always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
13003
Guido van Rossumd57fd912000-03-10 22:53:23 +000013004void
Thomas Wouters78890102000-07-22 19:25:51 +000013005_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013006{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013007 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013008
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000013009 Py_XDECREF(unicode_empty);
13010 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000013011
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013012 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013013 if (unicode_latin1[i]) {
13014 Py_DECREF(unicode_latin1[i]);
13015 unicode_latin1[i] = NULL;
13016 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013017 }
Christian Heimesa156e092008-02-16 07:38:31 +000013018 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000013019}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000013020
Walter Dörwald16807132007-05-25 13:52:07 +000013021void
13022PyUnicode_InternInPlace(PyObject **p)
13023{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013024 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
13025 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020013026#ifdef Py_DEBUG
13027 assert(s != NULL);
13028 assert(_PyUnicode_CHECK(s));
13029#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000013030 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020013031 return;
13032#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000013033 /* If it's a subclass, we don't really know what putting
13034 it in the interned dict might do. */
13035 if (!PyUnicode_CheckExact(s))
13036 return;
13037 if (PyUnicode_CHECK_INTERNED(s))
13038 return;
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013039 if (_PyUnicode_READY_REPLACE(p)) {
Victor Stinner4fae54c2011-10-03 02:01:52 +020013040 assert(0 && "PyUnicode_READY fail in PyUnicode_InternInPlace");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013041 return;
13042 }
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013043 s = (PyUnicodeObject *)(*p);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013044 if (interned == NULL) {
13045 interned = PyDict_New();
13046 if (interned == NULL) {
13047 PyErr_Clear(); /* Don't leave an exception */
13048 return;
13049 }
13050 }
13051 /* It might be that the GetItem call fails even
13052 though the key is present in the dictionary,
13053 namely when this happens during a stack overflow. */
13054 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000013055 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013056 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000013057
Benjamin Peterson29060642009-01-31 22:14:21 +000013058 if (t) {
13059 Py_INCREF(t);
13060 Py_DECREF(*p);
13061 *p = t;
13062 return;
13063 }
Walter Dörwald16807132007-05-25 13:52:07 +000013064
Benjamin Peterson14339b62009-01-31 16:36:08 +000013065 PyThreadState_GET()->recursion_critical = 1;
13066 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
13067 PyErr_Clear();
13068 PyThreadState_GET()->recursion_critical = 0;
13069 return;
13070 }
13071 PyThreadState_GET()->recursion_critical = 0;
13072 /* The two references in interned are not counted by refcnt.
13073 The deallocator will take care of this */
13074 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013075 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000013076}
13077
13078void
13079PyUnicode_InternImmortal(PyObject **p)
13080{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013081 PyUnicodeObject *u = (PyUnicodeObject *)*p;
13082
Benjamin Peterson14339b62009-01-31 16:36:08 +000013083 PyUnicode_InternInPlace(p);
13084 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013085 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013086 Py_INCREF(*p);
13087 }
Walter Dörwald16807132007-05-25 13:52:07 +000013088}
13089
13090PyObject *
13091PyUnicode_InternFromString(const char *cp)
13092{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013093 PyObject *s = PyUnicode_FromString(cp);
13094 if (s == NULL)
13095 return NULL;
13096 PyUnicode_InternInPlace(&s);
13097 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000013098}
13099
Alexander Belopolsky40018472011-02-26 01:02:56 +000013100void
13101_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000013102{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013103 PyObject *keys;
13104 PyUnicodeObject *s;
13105 Py_ssize_t i, n;
13106 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000013107
Benjamin Peterson14339b62009-01-31 16:36:08 +000013108 if (interned == NULL || !PyDict_Check(interned))
13109 return;
13110 keys = PyDict_Keys(interned);
13111 if (keys == NULL || !PyList_Check(keys)) {
13112 PyErr_Clear();
13113 return;
13114 }
Walter Dörwald16807132007-05-25 13:52:07 +000013115
Benjamin Peterson14339b62009-01-31 16:36:08 +000013116 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
13117 detector, interned unicode strings are not forcibly deallocated;
13118 rather, we give them their stolen references back, and then clear
13119 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000013120
Benjamin Peterson14339b62009-01-31 16:36:08 +000013121 n = PyList_GET_SIZE(keys);
13122 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000013123 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013124 for (i = 0; i < n; i++) {
13125 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013126 if (PyUnicode_READY(s) == -1)
13127 fprintf(stderr, "could not ready string\n");
13128 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013129 case SSTATE_NOT_INTERNED:
13130 /* XXX Shouldn't happen */
13131 break;
13132 case SSTATE_INTERNED_IMMORTAL:
13133 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013134 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013135 break;
13136 case SSTATE_INTERNED_MORTAL:
13137 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013138 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013139 break;
13140 default:
13141 Py_FatalError("Inconsistent interned string state.");
13142 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013143 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013144 }
13145 fprintf(stderr, "total size of all interned strings: "
13146 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
13147 "mortal/immortal\n", mortal_size, immortal_size);
13148 Py_DECREF(keys);
13149 PyDict_Clear(interned);
13150 Py_DECREF(interned);
13151 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000013152}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013153
13154
13155/********************* Unicode Iterator **************************/
13156
/* State of a str iterator: the string being walked and the position of the
   next character to hand out. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;        /* index of the next character to return */
    PyUnicodeObject *it_seq;    /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
13162
13163static void
13164unicodeiter_dealloc(unicodeiterobject *it)
13165{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013166 _PyObject_GC_UNTRACK(it);
13167 Py_XDECREF(it->it_seq);
13168 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013169}
13170
13171static int
13172unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
13173{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013174 Py_VISIT(it->it_seq);
13175 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013176}
13177
13178static PyObject *
13179unicodeiter_next(unicodeiterobject *it)
13180{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013181 PyUnicodeObject *seq;
13182 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013183
Benjamin Peterson14339b62009-01-31 16:36:08 +000013184 assert(it != NULL);
13185 seq = it->it_seq;
13186 if (seq == NULL)
13187 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013188 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013189
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013190 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
13191 int kind = PyUnicode_KIND(seq);
13192 void *data = PyUnicode_DATA(seq);
13193 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
13194 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013195 if (item != NULL)
13196 ++it->it_index;
13197 return item;
13198 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013199
Benjamin Peterson14339b62009-01-31 16:36:08 +000013200 Py_DECREF(seq);
13201 it->it_seq = NULL;
13202 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013203}
13204
13205static PyObject *
13206unicodeiter_len(unicodeiterobject *it)
13207{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013208 Py_ssize_t len = 0;
13209 if (it->it_seq)
13210 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
13211 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013212}
13213
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");

/* Method table of the str iterator type; it only provides
   __length_hint__. */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {NULL,      NULL}       /* sentinel */
};
13221
/* Type object for str iterators.  Slots are filled positionally; fields
   after the last listed entry are left zero-initialized. */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",                     /* tp_name */
    sizeof(unicodeiterobject),          /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_reserved */
    0,                                  /* tp_repr */
    0,                                  /* tp_as_number */
    0,                                  /* tp_as_sequence */
    0,                                  /* tp_as_mapping */
    0,                                  /* tp_hash */
    0,                                  /* tp_call */
    0,                                  /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    0,                                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                                  /* tp_clear */
    0,                                  /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    PyObject_SelfIter,                  /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,                /* tp_methods */
    0,                                  /* tp_members */
};
13254
13255static PyObject *
13256unicode_iter(PyObject *seq)
13257{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013258 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013259
Benjamin Peterson14339b62009-01-31 16:36:08 +000013260 if (!PyUnicode_Check(seq)) {
13261 PyErr_BadInternalCall();
13262 return NULL;
13263 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013264 if (PyUnicode_READY(seq) == -1)
13265 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013266 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
13267 if (it == NULL)
13268 return NULL;
13269 it->it_index = 0;
13270 Py_INCREF(seq);
13271 it->it_seq = (PyUnicodeObject *)seq;
13272 _PyObject_GC_TRACK(it);
13273 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013274}
13275
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013276#define UNIOP(x) Py_UNICODE_##x
13277#define UNIOP_t Py_UNICODE
13278#include "uniops.h"
13279#undef UNIOP
13280#undef UNIOP_t
13281#define UNIOP(x) Py_UCS4_##x
13282#define UNIOP_t Py_UCS4
13283#include "uniops.h"
13284#undef UNIOP
13285#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000013286
Victor Stinner71133ff2010-09-01 23:43:53 +000013287Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000013288PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000013289{
13290 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
13291 Py_UNICODE *copy;
13292 Py_ssize_t size;
13293
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013294 if (!PyUnicode_Check(unicode)) {
13295 PyErr_BadArgument();
13296 return NULL;
13297 }
Victor Stinner71133ff2010-09-01 23:43:53 +000013298 /* Ensure we won't overflow the size. */
13299 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
13300 PyErr_NoMemory();
13301 return NULL;
13302 }
13303 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
13304 size *= sizeof(Py_UNICODE);
13305 copy = PyMem_Malloc(size);
13306 if (copy == NULL) {
13307 PyErr_NoMemory();
13308 return NULL;
13309 }
13310 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
13311 return copy;
13312}
Martin v. Löwis5b222132007-06-10 09:51:05 +000013313
Georg Brandl66c221e2010-10-14 07:04:07 +000013314/* A _string module, to export formatter_parser and formatter_field_name_split
13315 to the string.Formatter class implemented in Python. */
13316
/* Functions exported by the _string module.  Both are registered with
   METH_O, i.e. they take exactly one argument. */
static PyMethodDef _string_methods[] = {
    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
     METH_O, PyDoc_STR("split the argument as a field name")},
    {"formatter_parser", (PyCFunction) formatter_parser,
     METH_O, PyDoc_STR("parse the argument as a format string")},
    {NULL, NULL}    /* sentinel */
};
13324
/* Module definition for _string, passed to PyModule_Create() by
   PyInit__string().  Fields are filled positionally. */
static struct PyModuleDef _string_module = {
    PyModuleDef_HEAD_INIT,
    "_string",                          /* module name */
    PyDoc_STR("string helper module"),  /* module docstring */
    0,                                  /* NOTE(review): presumably m_size -- confirm */
    _string_methods,                    /* method table */
    /* the remaining fields are left unset */
    NULL,
    NULL,
    NULL,
    NULL
};
13336
13337PyMODINIT_FUNC
13338PyInit__string(void)
13339{
13340 return PyModule_Create(&_string_module);
13341}
13342
13343
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013344#ifdef __cplusplus
13345}
13346#endif