blob: 30db418832050b10bf596c1b1e82c29eb10eacd4 [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
/* Limit for the Unicode object free list */

#define PyUnicode_MAXFREELIST       1024

/* Limit for the Unicode object free list stay alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a size less than this
   limit. This reduces malloc() overhead for small Unicode objects.

   At worst this will result in PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Endianness switches; defaults to little endian.
   Exactly one of BYTEORDER_IS_BIG_ENDIAN / BYTEORDER_IS_LITTLE_ENDIAN
   is defined, selected by WORDS_BIGENDIAN. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
79
/* --- Globals ------------------------------------------------------------

   The globals are initialized by the _PyUnicode_Init() API and should
   not be used before calling that API.

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000086
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000087
88#ifdef __cplusplus
89extern "C" {
90#endif
91
/* Object check used inside the accessor asserts below: the full
   consistency checker when Py_DEBUG is defined, otherwise only the
   type check. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif

/* Raw access to the utf8 field (no checks; usable as an lvalue). */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* Checked UTF-8 pointer: for compact ASCII objects this is the memory
   immediately following the PyASCIIObject header, otherwise the utf8
   field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* Raw access to the utf8_length field (no checks; usable as an lvalue). */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* Checked UTF-8 length: the object length for compact ASCII objects,
   otherwise the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
/* Unchecked field accessors (all usable as lvalues). */
#define _PyUnicode_WSTR(op) (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op) (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) (((PyASCIIObject *)(op))->hash)
/* Checked readers for the kind and the length. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* Data pointer of a non-compact (PyUnicodeObject) string. */
#define _PyUnicode_DATA_ANY(op) (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200126
/* Local override of PyUnicode_READY: evaluates to 0 when the object is
   already ready, otherwise to the result of _PyUnicode_Ready(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 : _PyUnicode_Ready((PyObject *)(op))))

/* Same as above, but takes a pointer to the object pointer and defers to
   _PyUnicode_ReadyReplace() when the object is not ready.  The argument
   is parenthesized so that expressions such as `&obj` or `p + 1` expand
   correctly. */
#define _PyUnicode_READY_REPLACE(p_obj)                 \
    (assert(_PyUnicode_CHECK(*(p_obj))),                \
     (PyUnicode_IS_READY(*(p_obj)) ?                    \
      0 : _PyUnicode_ReadyReplace((PyObject **)(p_obj))))
137
/* true if the utf8 pointer aliases the character data (must not be used
   on compact ASCII objects, see the assert) */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* true if the wstr pointer aliases the character data.
   Uses the macro argument throughout; the previous expansion referenced
   a variable literally named `unicode`, which only worked by accident at
   call sites that happened to use that name. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
145
/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data): the object is not compact ASCII, the
   utf8 pointer is set, and it does not alias the character data */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
153
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Each source element in [begin, end) is cast to to_type and stored at
   the matching index of the destination; nothing is written past
   (end - begin) elements. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        const from_type *iter_; to_type *to_;           \
        for (iter_ = (begin), to_ = (to_type *)(to);    \
             iter_ < (end);                             \
             ++iter_, ++to_) {                          \
            *to_ = (to_type)*iter_;                     \
        }                                               \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200168
/* The Unicode string has been modified: reset the hash
   (-1 is the value _PyUnicode_New() stores for "no hash yet") */
#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)
171
Walter Dörwald16807132007-05-25 13:52:07 +0000172/* This dictionary holds all interned unicode strings. Note that references
173 to strings in this dictionary are *not* counted in the string's ob_refcnt.
174 When the interned string reaches a refcnt of 0 the string deallocation
175 function will delete the reference from this dictionary.
176
177 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000178 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000179*/
180static PyObject *interned;
181
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000182/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200183static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184
185/* Single character Unicode strings in the Latin-1 range are being
186 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200187static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000188
/* Fast detection of the most frequent whitespace characters.
   Indexed by code point 0..127; an entry is 1 for whitespace, else 0. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x0009: * CHARACTER TABULATION */
/*     case 0x000A: * LINE FEED */
/*     case 0x000B: * LINE TABULATION */
/*     case 0x000C: * FORM FEED */
/*     case 0x000D: * CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x001C: * FILE SEPARATOR */
/*     case 0x001D: * GROUP SEPARATOR */
/*     case 0x001E: * RECORD SEPARATOR */
/*     case 0x001F: * UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
/*     case 0x0020: * SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
219
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200220/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200221static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200222static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200223
Alexander Belopolsky40018472011-02-26 01:02:56 +0000224static PyObject *
225unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000226 PyObject **errorHandler,const char *encoding, const char *reason,
227 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
228 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
229
Alexander Belopolsky40018472011-02-26 01:02:56 +0000230static void
231raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300232 const char *encoding,
233 const Py_UNICODE *unicode, Py_ssize_t size,
234 Py_ssize_t startpos, Py_ssize_t endpos,
235 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000236
/* Fast detection of ASCII line break characters.
   Indexed by code point 0..127; an entry is 1 for a line break, else 0. */
static unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
264
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300265/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
266 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000267Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000268PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000269{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000270#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000271 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000272#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000273 /* This is actually an illegal character, so it should
274 not be passed to unichr. */
275 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000276#endif
277}
278
#ifdef Py_DEBUG
/* Debug-only check of the state bits of a unicode object.

   Three layouts are distinguished by the state flags:
     - compact ASCII:     1-byte kind, ready;
     - compact non-ASCII: 1/2/4-byte kind, ascii flag clear, ready;
     - non-compact:       either a not-yet-ready wstr-only object (no data,
                          no utf8, not interned) or a ready object with a
                          1/2/4-byte kind and a non-NULL data pointer.

   Every expectation is an assert(); the function returns 1 so that it
   can itself be used inside an assert(). */
static int
_PyUnicode_CheckConsistency(void *op)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else if (ascii->state.compact == 1) {
        assert(kind == PyUnicode_1BYTE_KIND
               || kind == PyUnicode_2BYTE_KIND
               || kind == PyUnicode_4BYTE_KIND);
        assert(ascii->state.ascii == 0);
        assert(ascii->state.ready == 1);
    } else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        PyUnicodeObject *unicode = (PyUnicodeObject *)op;

        if (kind == PyUnicode_WCHAR_KIND) {
            /* legacy object that only carries a wstr buffer so far */
            assert(ascii->state.compact == 0);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 0);
            assert(ascii->wstr != NULL);
            assert(unicode->data.any == NULL);
            assert(compact->utf8 == NULL);
            assert(ascii->state.interned == SSTATE_NOT_INTERNED);
        }
        else {
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.compact == 0);
            assert(ascii->state.ready == 1);
            assert(unicode->data.any != NULL);
        }
    }
    return 1;
}
#endif
326
Thomas Wouters477c8d52006-05-27 19:21:47 +0000327/* --- Bloom Filters ----------------------------------------------------- */
328
329/* stuff to implement simple "bloom filters" for Unicode characters.
330 to keep things simple, we use a single bitmask, using the least 5
331 bits from each unicode characters as the bit index. */
332
333/* the linebreak mask is set up by Unicode_Init below */
334
Antoine Pitrouf068f942010-01-13 14:19:12 +0000335#if LONG_BIT >= 128
336#define BLOOM_WIDTH 128
337#elif LONG_BIT >= 64
338#define BLOOM_WIDTH 64
339#elif LONG_BIT >= 32
340#define BLOOM_WIDTH 32
341#else
342#error "LONG_BIT is smaller than 32"
343#endif
344
Thomas Wouters477c8d52006-05-27 19:21:47 +0000345#define BLOOM_MASK unsigned long
346
347static BLOOM_MASK bloom_linebreak;
348
Antoine Pitrouf068f942010-01-13 14:19:12 +0000349#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
350#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000351
Benjamin Peterson29060642009-01-31 22:14:21 +0000352#define BLOOM_LINEBREAK(ch) \
353 ((ch) < 128U ? ascii_linebreak[(ch)] : \
354 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000355
Alexander Belopolsky40018472011-02-26 01:02:56 +0000356Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200357make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000358{
359 /* calculate simple bloom-style bitmask for a given unicode string */
360
Antoine Pitrouf068f942010-01-13 14:19:12 +0000361 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000362 Py_ssize_t i;
363
364 mask = 0;
365 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200366 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000367
368 return mask;
369}
370
/* true if chr occurs in str: the bloom mask rejects most misses cheaply,
   PyUnicode_FindChar() confirms a possible hit */
#define BLOOM_MEMBER(mask, chr, str) \
    (BLOOM(mask, chr) \
     && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000374
Guido van Rossumd57fd912000-03-10 22:53:23 +0000375/* --- Unicode Object ----------------------------------------------------- */
376
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200377static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200378fixup(PyUnicodeObject *self, Py_UCS4 (*fixfct)(PyUnicodeObject *s));
379
380Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
381 Py_ssize_t size, Py_UCS4 ch,
382 int direction)
383{
384 /* like wcschr, but doesn't stop at NULL characters */
385 Py_ssize_t i;
386 if (direction == 1) {
387 for(i = 0; i < size; i++)
388 if (PyUnicode_READ(kind, s, i) == ch)
389 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
390 }
391 else {
392 for(i = size-1; i >= 0; i--)
393 if (PyUnicode_READ(kind, s, i) == ch)
394 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
395 }
396 return NULL;
397}
398
Victor Stinnerfe226c02011-10-03 03:52:20 +0200399static PyObject*
400resize_compact(PyObject *unicode, Py_ssize_t length)
401{
402 Py_ssize_t char_size;
403 Py_ssize_t struct_size;
404 Py_ssize_t new_size;
405 int share_wstr;
406
407 assert(PyUnicode_IS_READY(unicode));
408 char_size = PyUnicode_CHARACTER_SIZE(unicode);
409 if (PyUnicode_IS_COMPACT_ASCII(unicode))
410 struct_size = sizeof(PyASCIIObject);
411 else
412 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200413 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200414
415 _Py_DEC_REFTOTAL;
416 _Py_ForgetReference(unicode);
417
418 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
419 PyErr_NoMemory();
420 return NULL;
421 }
422 new_size = (struct_size + (length + 1) * char_size);
423
424 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
425 if (unicode == NULL) {
426 PyObject_Del(unicode);
427 PyErr_NoMemory();
428 return NULL;
429 }
430 _Py_NewReference(unicode);
431 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200432 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200433 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200434 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
435 _PyUnicode_WSTR_LENGTH(unicode) = length;
436 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200437 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
438 length, 0);
439 return unicode;
440}
441
Alexander Belopolsky40018472011-02-26 01:02:56 +0000442static int
Victor Stinnerfe226c02011-10-03 03:52:20 +0200443resize_inplace(register PyUnicodeObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000444{
445 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000446
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200447 assert(!PyUnicode_IS_COMPACT(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200448
Victor Stinnerfe226c02011-10-03 03:52:20 +0200449 assert(Py_REFCNT(unicode) == 1);
Victor Stinnerb15d4d82011-09-28 23:59:20 +0200450 _PyUnicode_DIRTY(unicode);
Tim Petersced69f82003-09-16 20:30:58 +0000451
Victor Stinnerfe226c02011-10-03 03:52:20 +0200452 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
453 {
454 PyObject_DEL(_PyUnicode_UTF8(unicode));
455 _PyUnicode_UTF8(unicode) = NULL;
456 }
457
458 if (PyUnicode_IS_READY(unicode)) {
459 Py_ssize_t char_size;
460 Py_ssize_t new_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200461 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200462 void *data;
463
464 data = _PyUnicode_DATA_ANY(unicode);
465 assert(data != NULL);
466 char_size = PyUnicode_CHARACTER_SIZE(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200467 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
468 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200469
470 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
471 PyErr_NoMemory();
472 return -1;
473 }
474 new_size = (length + 1) * char_size;
475
476 data = (PyObject *)PyObject_REALLOC(data, new_size);
477 if (data == NULL) {
478 PyErr_NoMemory();
479 return -1;
480 }
481 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200482 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200483 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200484 _PyUnicode_WSTR_LENGTH(unicode) = length;
485 }
486 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200487 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200488 _PyUnicode_UTF8_LENGTH(unicode) = length;
489 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200490 _PyUnicode_LENGTH(unicode) = length;
491 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
492 if (share_wstr)
493 return 0;
494 }
495 if (_PyUnicode_WSTR(unicode) != NULL) {
496 assert(_PyUnicode_WSTR(unicode) != NULL);
497
498 oldstr = _PyUnicode_WSTR(unicode);
499 _PyUnicode_WSTR(unicode) = PyObject_REALLOC(_PyUnicode_WSTR(unicode),
500 sizeof(Py_UNICODE) * (length + 1));
501 if (!_PyUnicode_WSTR(unicode)) {
502 _PyUnicode_WSTR(unicode) = (Py_UNICODE *)oldstr;
503 PyErr_NoMemory();
504 return -1;
505 }
506 _PyUnicode_WSTR(unicode)[length] = 0;
507 _PyUnicode_WSTR_LENGTH(unicode) = length;
508 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000509 return 0;
510}
511
Victor Stinnerfe226c02011-10-03 03:52:20 +0200512static PyObject*
513resize_copy(PyObject *unicode, Py_ssize_t length)
514{
515 Py_ssize_t copy_length;
516 if (PyUnicode_IS_COMPACT(unicode)) {
517 PyObject *copy;
518 assert(PyUnicode_IS_READY(unicode));
519
520 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
521 if (copy == NULL)
522 return NULL;
523
524 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
525 if (PyUnicode_CopyCharacters(copy, 0,
526 unicode, 0,
527 copy_length) < 0)
528 {
529 Py_DECREF(copy);
530 return NULL;
531 }
532 return copy;
533 } else {
Victor Stinner2fd82272011-10-03 04:06:05 +0200534 PyUnicodeObject *w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200535 assert(_PyUnicode_WSTR(unicode) != NULL);
536 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinner2fd82272011-10-03 04:06:05 +0200537 w = _PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200538 if (w == NULL)
539 return NULL;
540 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
541 copy_length = Py_MIN(copy_length, length);
542 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
543 copy_length);
544 return (PyObject*)w;
545 }
546}
547
Guido van Rossumd57fd912000-03-10 22:53:23 +0000548/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000549 Ux0000 terminated; some code (e.g. new_identifier)
550 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000551
552 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000553 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000554
555*/
556
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200557#ifdef Py_DEBUG
558int unicode_old_new_calls = 0;
559#endif
560
Alexander Belopolsky40018472011-02-26 01:02:56 +0000561static PyUnicodeObject *
562_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000563{
564 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200565 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000566
Thomas Wouters477c8d52006-05-27 19:21:47 +0000567 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000568 if (length == 0 && unicode_empty != NULL) {
569 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200570 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000571 }
572
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000573 /* Ensure we won't overflow the size. */
574 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
575 return (PyUnicodeObject *)PyErr_NoMemory();
576 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200577 if (length < 0) {
578 PyErr_SetString(PyExc_SystemError,
579 "Negative size passed to _PyUnicode_New");
580 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000581 }
582
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200583#ifdef Py_DEBUG
584 ++unicode_old_new_calls;
585#endif
586
587 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
588 if (unicode == NULL)
589 return NULL;
590 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
591 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
592 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000593 PyErr_NoMemory();
594 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000595 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200596
Jeremy Hyltond8082792003-09-16 19:41:39 +0000597 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000598 * the caller fails before initializing str -- unicode_resize()
599 * reads str[0], and the Keep-Alive optimization can keep memory
600 * allocated for str alive across a call to unicode_dealloc(unicode).
601 * We don't want unicode_resize to read uninitialized memory in
602 * that case.
603 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200604 _PyUnicode_WSTR(unicode)[0] = 0;
605 _PyUnicode_WSTR(unicode)[length] = 0;
606 _PyUnicode_WSTR_LENGTH(unicode) = length;
607 _PyUnicode_HASH(unicode) = -1;
608 _PyUnicode_STATE(unicode).interned = 0;
609 _PyUnicode_STATE(unicode).kind = 0;
610 _PyUnicode_STATE(unicode).compact = 0;
611 _PyUnicode_STATE(unicode).ready = 0;
612 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200613 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200614 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200615 _PyUnicode_UTF8(unicode) = NULL;
616 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000617 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000618
Benjamin Peterson29060642009-01-31 22:14:21 +0000619 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000620 /* XXX UNREF/NEWREF interface should be more symmetrical */
621 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000622 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000623 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000624 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000625}
626
Victor Stinnerf42dc442011-10-02 23:33:16 +0200627static const char*
628unicode_kind_name(PyObject *unicode)
629{
Victor Stinner42dfd712011-10-03 14:41:45 +0200630 /* don't check consistency: unicode_kind_name() is called from
631 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200632 if (!PyUnicode_IS_COMPACT(unicode))
633 {
634 if (!PyUnicode_IS_READY(unicode))
635 return "wstr";
636 switch(PyUnicode_KIND(unicode))
637 {
638 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200639 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200640 return "legacy ascii";
641 else
642 return "legacy latin1";
643 case PyUnicode_2BYTE_KIND:
644 return "legacy UCS2";
645 case PyUnicode_4BYTE_KIND:
646 return "legacy UCS4";
647 default:
648 return "<legacy invalid kind>";
649 }
650 }
651 assert(PyUnicode_IS_READY(unicode));
652 switch(PyUnicode_KIND(unicode))
653 {
654 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200655 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200656 return "ascii";
657 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200658 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200659 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200660 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200661 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200662 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200663 default:
664 return "<invalid compact kind>";
665 }
666}
667
#ifdef Py_DEBUG
int unicode_new_new_calls = 0;

/* Functions wrapping macros for use in debugger */
char *_PyUnicode_utf8(void *unicode){
    return PyUnicode_UTF8(unicode);
}

void *_PyUnicode_compact_data(void *unicode) {
    return _PyUnicode_COMPACT_DATA(unicode);
}
/* Print the candidate data locations of `unicode` to stdout and return
   PyUnicode_DATA(unicode). */
void *_PyUnicode_data(void *unicode){
    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}

/* Print a one-line description of `op` to stdout: layout name, length,
   wstr/utf8 pointers (flagged "shared" when they alias the data) and the
   data pointer. */
void
_PyUnicode_Dump(PyObject *op)
{
    PyASCIIObject *ascii = (PyASCIIObject *)op;
    PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
    PyUnicodeObject *unicode = (PyUnicodeObject *)op;
    void *data;
    /* lengths are signed Py_ssize_t values, hence %zd rather than %zu */
    printf("%s: len=%zd, ",unicode_kind_name(op), ascii->length);
    if (ascii->state.compact) {
        /* compact ASCII data follows the smaller PyASCIIObject header */
        if (ascii->state.ascii)
            data = (ascii + 1);
        else
            data = (compact + 1);
    }
    else
        data = unicode->data.any;
    if (ascii->wstr == data)
        printf("shared ");
    printf("wstr=%p", ascii->wstr);
    /* wstr_length/utf8/utf8_length do not exist in compact ASCII objects */
    if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
        printf(" (%zd), ", compact->wstr_length);
        if (!ascii->state.compact && compact->utf8 == unicode->data.any)
            printf("shared ");
        printf("utf8=%p (%zd)", compact->utf8, compact->utf8_length);
    }
    printf(", data=%p\n", data);
}
#endif
713
714PyObject *
715PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
716{
717 PyObject *obj;
718 PyCompactUnicodeObject *unicode;
719 void *data;
720 int kind_state;
721 int is_sharing = 0, is_ascii = 0;
722 Py_ssize_t char_size;
723 Py_ssize_t struct_size;
724
725 /* Optimization for empty strings */
726 if (size == 0 && unicode_empty != NULL) {
727 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200728 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200729 }
730
731#ifdef Py_DEBUG
732 ++unicode_new_new_calls;
733#endif
734
735 struct_size = sizeof(PyCompactUnicodeObject);
736 if (maxchar < 128) {
737 kind_state = PyUnicode_1BYTE_KIND;
738 char_size = 1;
739 is_ascii = 1;
740 struct_size = sizeof(PyASCIIObject);
741 }
742 else if (maxchar < 256) {
743 kind_state = PyUnicode_1BYTE_KIND;
744 char_size = 1;
745 }
746 else if (maxchar < 65536) {
747 kind_state = PyUnicode_2BYTE_KIND;
748 char_size = 2;
749 if (sizeof(wchar_t) == 2)
750 is_sharing = 1;
751 }
752 else {
753 kind_state = PyUnicode_4BYTE_KIND;
754 char_size = 4;
755 if (sizeof(wchar_t) == 4)
756 is_sharing = 1;
757 }
758
759 /* Ensure we won't overflow the size. */
760 if (size < 0) {
761 PyErr_SetString(PyExc_SystemError,
762 "Negative size passed to PyUnicode_New");
763 return NULL;
764 }
765 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
766 return PyErr_NoMemory();
767
768 /* Duplicated allocation code from _PyObject_New() instead of a call to
769 * PyObject_New() so we are able to allocate space for the object and
770 * it's data buffer.
771 */
772 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
773 if (obj == NULL)
774 return PyErr_NoMemory();
775 obj = PyObject_INIT(obj, &PyUnicode_Type);
776 if (obj == NULL)
777 return NULL;
778
779 unicode = (PyCompactUnicodeObject *)obj;
780 if (is_ascii)
781 data = ((PyASCIIObject*)obj) + 1;
782 else
783 data = unicode + 1;
784 _PyUnicode_LENGTH(unicode) = size;
785 _PyUnicode_HASH(unicode) = -1;
786 _PyUnicode_STATE(unicode).interned = 0;
787 _PyUnicode_STATE(unicode).kind = kind_state;
788 _PyUnicode_STATE(unicode).compact = 1;
789 _PyUnicode_STATE(unicode).ready = 1;
790 _PyUnicode_STATE(unicode).ascii = is_ascii;
791 if (is_ascii) {
792 ((char*)data)[size] = 0;
793 _PyUnicode_WSTR(unicode) = NULL;
794 }
795 else if (kind_state == PyUnicode_1BYTE_KIND) {
796 ((char*)data)[size] = 0;
797 _PyUnicode_WSTR(unicode) = NULL;
798 _PyUnicode_WSTR_LENGTH(unicode) = 0;
799 unicode->utf8_length = 0;
800 unicode->utf8 = NULL;
801 }
802 else {
803 unicode->utf8 = NULL;
804 if (kind_state == PyUnicode_2BYTE_KIND)
805 ((Py_UCS2*)data)[size] = 0;
806 else /* kind_state == PyUnicode_4BYTE_KIND */
807 ((Py_UCS4*)data)[size] = 0;
808 if (is_sharing) {
809 _PyUnicode_WSTR_LENGTH(unicode) = size;
810 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
811 }
812 else {
813 _PyUnicode_WSTR_LENGTH(unicode) = 0;
814 _PyUnicode_WSTR(unicode) = NULL;
815 }
816 }
817 return obj;
818}
819
#if SIZEOF_WCHAR_T == 2
/* Convert a 16-bit wchar_t buffer [begin, end) into the 4-byte data of
   `unicode`, combining each high/low surrogate pair into a single code point.
   A surrogate that is not part of a valid pair is copied unchanged.  The other
   conversions are implemented as macros for efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyUnicodeObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        int is_pair;

        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        /* src[1] is only read when it lies inside the buffer. */
        is_pair = (src[0] >= 0xD800 && src[0] <= 0xDBFF
                   && (src + 1) < end
                   && src[1] >= 0xDC00 && src[1] <= 0xDFFF);
        if (!is_pair) {
            *dst++ = *src;
            src++;
            continue;
        }
        *dst++ = (((src[0] & 0x3FF)<<10) | (src[1] & 0x3FF)) + 0x10000;
        src += 2;
    }
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
858
Victor Stinnercd9950f2011-10-02 00:34:53 +0200859static int
860_PyUnicode_Dirty(PyObject *unicode)
861{
Victor Stinner910337b2011-10-03 03:20:16 +0200862 assert(_PyUnicode_CHECK(unicode));
Victor Stinnercd9950f2011-10-02 00:34:53 +0200863 if (Py_REFCNT(unicode) != 1) {
864 PyErr_SetString(PyExc_ValueError,
865 "Cannot modify a string having more than 1 reference");
866 return -1;
867 }
868 _PyUnicode_DIRTY(unicode);
869 return 0;
870}
871
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200872Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200873PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
874 PyObject *from, Py_ssize_t from_start,
875 Py_ssize_t how_many)
876{
Victor Stinnera0702ab2011-09-29 14:14:38 +0200877 unsigned int from_kind, to_kind;
878 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200879
Victor Stinnerb1536152011-09-30 02:26:10 +0200880 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
881 PyErr_BadInternalCall();
882 return -1;
883 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200884
885 if (PyUnicode_READY(from))
886 return -1;
887 if (PyUnicode_READY(to))
888 return -1;
889
Victor Stinnerff9e50f2011-09-28 22:17:19 +0200890 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200891 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
892 PyErr_Format(PyExc_ValueError,
893 "Cannot write %zi characters at %zi "
894 "in a string of %zi characters",
895 how_many, to_start, PyUnicode_GET_LENGTH(to));
896 return -1;
897 }
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200898 if (how_many == 0)
899 return 0;
900
Victor Stinnercd9950f2011-10-02 00:34:53 +0200901 if (_PyUnicode_Dirty(to))
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200902 return -1;
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200903
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200904 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200905 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200906 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200907 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200908
Victor Stinnerf42dc442011-10-02 23:33:16 +0200909 if (from_kind == to_kind
910 /* deny latin1 => ascii */
911 && PyUnicode_MAX_CHAR_VALUE(to) >= PyUnicode_MAX_CHAR_VALUE(from))
912 {
Victor Stinnera0702ab2011-09-29 14:14:38 +0200913 Py_MEMCPY((char*)to_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200914 + PyUnicode_KIND_SIZE(to_kind, to_start),
Victor Stinnera0702ab2011-09-29 14:14:38 +0200915 (char*)from_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200916 + PyUnicode_KIND_SIZE(from_kind, from_start),
917 PyUnicode_KIND_SIZE(to_kind, how_many));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200918 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200919 else if (from_kind == PyUnicode_1BYTE_KIND
920 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200921 {
922 _PyUnicode_CONVERT_BYTES(
923 Py_UCS1, Py_UCS2,
924 PyUnicode_1BYTE_DATA(from) + from_start,
925 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
926 PyUnicode_2BYTE_DATA(to) + to_start
927 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200928 }
Victor Stinner157f83f2011-09-28 21:41:31 +0200929 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200930 && to_kind == PyUnicode_4BYTE_KIND)
931 {
932 _PyUnicode_CONVERT_BYTES(
933 Py_UCS1, Py_UCS4,
934 PyUnicode_1BYTE_DATA(from) + from_start,
935 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
936 PyUnicode_4BYTE_DATA(to) + to_start
937 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200938 }
939 else if (from_kind == PyUnicode_2BYTE_KIND
940 && to_kind == PyUnicode_4BYTE_KIND)
941 {
942 _PyUnicode_CONVERT_BYTES(
943 Py_UCS2, Py_UCS4,
944 PyUnicode_2BYTE_DATA(from) + from_start,
945 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
946 PyUnicode_4BYTE_DATA(to) + to_start
947 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200948 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200949 else {
950 int invalid_kinds;
Victor Stinnerf42dc442011-10-02 23:33:16 +0200951
952 /* check if max_char(from substring) <= max_char(to) */
953 if (from_kind > to_kind
954 /* latin1 => ascii */
Victor Stinnera3b334d2011-10-03 13:53:37 +0200955 || (PyUnicode_IS_ASCII(to)
Victor Stinnerf42dc442011-10-02 23:33:16 +0200956 && to_kind == PyUnicode_1BYTE_KIND
Victor Stinnera3b334d2011-10-03 13:53:37 +0200957 && !PyUnicode_IS_ASCII(from)))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200958 {
Victor Stinnera0702ab2011-09-29 14:14:38 +0200959 /* slow path to check for character overflow */
960 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
961 Py_UCS4 ch, maxchar;
962 Py_ssize_t i;
963
964 maxchar = 0;
965 invalid_kinds = 0;
966 for (i=0; i < how_many; i++) {
967 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
968 if (ch > maxchar) {
969 maxchar = ch;
970 if (maxchar > to_maxchar) {
971 invalid_kinds = 1;
972 break;
973 }
974 }
975 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
976 }
977 }
978 else
979 invalid_kinds = 1;
980 if (invalid_kinds) {
981 PyErr_Format(PyExc_ValueError,
Victor Stinnerf42dc442011-10-02 23:33:16 +0200982 "Cannot copy %s characters "
983 "into a string of %s characters",
984 unicode_kind_name(from),
985 unicode_kind_name(to));
Victor Stinnera0702ab2011-09-29 14:14:38 +0200986 return -1;
987 }
988 }
989 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200990}
991
Victor Stinner17222162011-09-28 22:15:37 +0200992/* Find the maximum code point and count the number of surrogate pairs so a
993 correct string length can be computed before converting a string to UCS4.
994 This function counts single surrogates as a character and not as a pair.
995
996 Return 0 on success, or -1 on error. */
997static int
998find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
999 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001000{
1001 const wchar_t *iter;
1002
Victor Stinnerc53be962011-10-02 21:33:54 +02001003 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001004 if (num_surrogates == NULL || maxchar == NULL) {
1005 PyErr_SetString(PyExc_SystemError,
1006 "unexpected NULL arguments to "
1007 "PyUnicode_FindMaxCharAndNumSurrogatePairs");
1008 return -1;
1009 }
1010
1011 *num_surrogates = 0;
1012 *maxchar = 0;
1013
1014 for (iter = begin; iter < end; ) {
1015 if (*iter > *maxchar)
1016 *maxchar = *iter;
1017#if SIZEOF_WCHAR_T == 2
1018 if (*iter >= 0xD800 && *iter <= 0xDBFF
1019 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
1020 {
1021 Py_UCS4 surrogate_val;
1022 surrogate_val = (((iter[0] & 0x3FF)<<10)
1023 | (iter[1] & 0x3FF)) + 0x10000;
1024 ++(*num_surrogates);
1025 if (surrogate_val > *maxchar)
1026 *maxchar = surrogate_val;
1027 iter += 2;
1028 }
1029 else
1030 iter++;
1031#else
1032 iter++;
1033#endif
1034 }
1035 return 0;
1036}
1037
#ifdef Py_DEBUG
/* Debug-build counter: incremented once per call to unicode_ready(). */
int unicode_ready_calls = 0;
#endif
1041
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001042static int
1043unicode_ready(PyObject **p_obj, int replace)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001044{
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001045 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001046 wchar_t *end;
1047 Py_UCS4 maxchar = 0;
1048 Py_ssize_t num_surrogates;
1049#if SIZEOF_WCHAR_T == 2
1050 Py_ssize_t length_wo_surrogates;
1051#endif
1052
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001053 assert(p_obj != NULL);
1054 unicode = (PyUnicodeObject *)*p_obj;
1055
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001056 /* _PyUnicode_Ready() is only intented for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001057 strings were created using _PyObject_New() and where no canonical
1058 representation (the str field) has been set yet aka strings
1059 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001060 assert(_PyUnicode_CHECK(unicode));
1061 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001062 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001063 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001064 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001065 /* Actually, it should neither be interned nor be anything else: */
1066 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001067
1068#ifdef Py_DEBUG
1069 ++unicode_ready_calls;
1070#endif
1071
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001072#ifdef Py_DEBUG
1073 assert(!replace || Py_REFCNT(unicode) == 1);
1074#else
1075 if (replace && Py_REFCNT(unicode) != 1)
1076 replace = 0;
1077#endif
1078 if (replace) {
1079 Py_ssize_t len = _PyUnicode_WSTR_LENGTH(unicode);
1080 wchar_t *wstr = _PyUnicode_WSTR(unicode);
1081 /* Optimization for empty strings */
1082 if (len == 0) {
1083 Py_INCREF(unicode_empty);
1084 Py_DECREF(*p_obj);
1085 *p_obj = unicode_empty;
1086 return 0;
1087 }
1088 if (len == 1 && wstr[0] < 256) {
1089 PyObject *latin1_char = get_latin1_char((unsigned char)wstr[0]);
1090 if (latin1_char == NULL)
1091 return -1;
1092 Py_DECREF(*p_obj);
1093 *p_obj = latin1_char;
1094 return 0;
1095 }
1096 }
1097
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001098 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001099 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001100 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001101 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001102
1103 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001104 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1105 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001106 PyErr_NoMemory();
1107 return -1;
1108 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001109 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001110 _PyUnicode_WSTR(unicode), end,
1111 PyUnicode_1BYTE_DATA(unicode));
1112 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1113 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1114 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1115 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001116 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001117 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001118 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001119 }
1120 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001121 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001122 _PyUnicode_UTF8(unicode) = NULL;
1123 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001124 }
1125 PyObject_FREE(_PyUnicode_WSTR(unicode));
1126 _PyUnicode_WSTR(unicode) = NULL;
1127 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1128 }
1129 /* In this case we might have to convert down from 4-byte native
1130 wchar_t to 2-byte unicode. */
1131 else if (maxchar < 65536) {
1132 assert(num_surrogates == 0 &&
1133 "FindMaxCharAndNumSurrogatePairs() messed up");
1134
Victor Stinner506f5922011-09-28 22:34:18 +02001135#if SIZEOF_WCHAR_T == 2
1136 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001137 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001138 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1139 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1140 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001141 _PyUnicode_UTF8(unicode) = NULL;
1142 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001143#else
1144 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001145 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001146 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001147 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001148 PyErr_NoMemory();
1149 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001150 }
Victor Stinner506f5922011-09-28 22:34:18 +02001151 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1152 _PyUnicode_WSTR(unicode), end,
1153 PyUnicode_2BYTE_DATA(unicode));
1154 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1155 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1156 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001157 _PyUnicode_UTF8(unicode) = NULL;
1158 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001159 PyObject_FREE(_PyUnicode_WSTR(unicode));
1160 _PyUnicode_WSTR(unicode) = NULL;
1161 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1162#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001163 }
1164 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1165 else {
1166#if SIZEOF_WCHAR_T == 2
1167 /* in case the native representation is 2-bytes, we need to allocate a
1168 new normalized 4-byte version. */
1169 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001170 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1171 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001172 PyErr_NoMemory();
1173 return -1;
1174 }
1175 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1176 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001177 _PyUnicode_UTF8(unicode) = NULL;
1178 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001179 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1180 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001181 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001182 PyObject_FREE(_PyUnicode_WSTR(unicode));
1183 _PyUnicode_WSTR(unicode) = NULL;
1184 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1185#else
1186 assert(num_surrogates == 0);
1187
Victor Stinnerc3c74152011-10-02 20:39:55 +02001188 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001189 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001190 _PyUnicode_UTF8(unicode) = NULL;
1191 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001192 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1193#endif
1194 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1195 }
1196 _PyUnicode_STATE(unicode).ready = 1;
1197 return 0;
1198}
1199
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001200int
1201_PyUnicode_ReadyReplace(PyObject **op)
1202{
1203 return unicode_ready(op, 1);
1204}
1205
1206int
1207_PyUnicode_Ready(PyObject *op)
1208{
1209 return unicode_ready(&op, 0);
1210}
1211
Alexander Belopolsky40018472011-02-26 01:02:56 +00001212static void
1213unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001214{
Walter Dörwald16807132007-05-25 13:52:07 +00001215 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001216 case SSTATE_NOT_INTERNED:
1217 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001218
Benjamin Peterson29060642009-01-31 22:14:21 +00001219 case SSTATE_INTERNED_MORTAL:
1220 /* revive dead object temporarily for DelItem */
1221 Py_REFCNT(unicode) = 3;
1222 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
1223 Py_FatalError(
1224 "deletion of interned string failed");
1225 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001226
Benjamin Peterson29060642009-01-31 22:14:21 +00001227 case SSTATE_INTERNED_IMMORTAL:
1228 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001229
Benjamin Peterson29060642009-01-31 22:14:21 +00001230 default:
1231 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001232 }
1233
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001234 if (_PyUnicode_WSTR(unicode) &&
1235 (!PyUnicode_IS_READY(unicode) ||
1236 _PyUnicode_WSTR(unicode) != PyUnicode_DATA(unicode)))
1237 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001238 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001239 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001240
1241 if (PyUnicode_IS_COMPACT(unicode)) {
1242 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001243 }
1244 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001245 if (_PyUnicode_DATA_ANY(unicode))
1246 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Benjamin Peterson29060642009-01-31 22:14:21 +00001247 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001248 }
1249}
1250
Alexander Belopolsky40018472011-02-26 01:02:56 +00001251static int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001252unicode_resizable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001253{
Victor Stinnera3be6132011-10-03 02:16:37 +02001254 Py_ssize_t len;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001255 if (Py_REFCNT(unicode) != 1)
1256 return 0;
1257 if (PyUnicode_CHECK_INTERNED(unicode))
1258 return 0;
1259 if (unicode == unicode_empty)
1260 return 0;
Victor Stinnera3be6132011-10-03 02:16:37 +02001261 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
1262 len = PyUnicode_WSTR_LENGTH(unicode);
1263 else
1264 len = PyUnicode_GET_LENGTH(unicode);
1265 if (len == 1) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001266 Py_UCS4 ch;
Victor Stinnera3be6132011-10-03 02:16:37 +02001267 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001268 ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnera3be6132011-10-03 02:16:37 +02001269 else
1270 ch = PyUnicode_READ_CHAR(unicode, 0);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001271 if (ch < 256 && unicode_latin1[ch] == unicode)
1272 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001273 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001274 return 1;
1275}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001276
Victor Stinnerfe226c02011-10-03 03:52:20 +02001277static int
1278unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1279{
1280 PyObject *unicode;
1281 Py_ssize_t old_length;
1282
1283 assert(p_unicode != NULL);
1284 unicode = *p_unicode;
1285
1286 assert(unicode != NULL);
1287 assert(PyUnicode_Check(unicode));
1288 assert(0 <= length);
1289
Victor Stinner910337b2011-10-03 03:20:16 +02001290 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001291 old_length = PyUnicode_WSTR_LENGTH(unicode);
1292 else
1293 old_length = PyUnicode_GET_LENGTH(unicode);
1294 if (old_length == length)
1295 return 0;
1296
1297 /* FIXME: really create a new object? */
1298 if (!unicode_resizable(unicode)) {
1299 PyObject *copy = resize_copy(unicode, length);
1300 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001301 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001302 Py_DECREF(*p_unicode);
1303 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001304 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001305 }
1306
Victor Stinnerfe226c02011-10-03 03:52:20 +02001307 if (PyUnicode_IS_COMPACT(unicode)) {
1308 *p_unicode = resize_compact(unicode, length);
1309 if (*p_unicode == NULL)
1310 return -1;
1311 return 0;
1312 } else
1313 return resize_inplace((PyUnicodeObject*)unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001314}
1315
Alexander Belopolsky40018472011-02-26 01:02:56 +00001316int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001317PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001318{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001319 PyObject *unicode;
1320 if (p_unicode == NULL) {
1321 PyErr_BadInternalCall();
1322 return -1;
1323 }
1324 unicode = *p_unicode;
1325 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0
1326 || _PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND)
1327 {
1328 PyErr_BadInternalCall();
1329 return -1;
1330 }
1331 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001332}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001333
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001334static PyObject*
1335get_latin1_char(unsigned char ch)
1336{
Victor Stinnera464fc12011-10-02 20:39:30 +02001337 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001338 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001339 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001340 if (!unicode)
1341 return NULL;
1342 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1343 unicode_latin1[ch] = unicode;
1344 }
1345 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001346 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001347}
1348
Alexander Belopolsky40018472011-02-26 01:02:56 +00001349PyObject *
1350PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001351{
1352 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001353 Py_UCS4 maxchar = 0;
1354 Py_ssize_t num_surrogates;
1355
1356 if (u == NULL)
1357 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001358
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001359 /* If the Unicode data is known at construction time, we can apply
1360 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001361
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001362 /* Optimization for empty strings */
1363 if (size == 0 && unicode_empty != NULL) {
1364 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001365 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001366 }
Tim Petersced69f82003-09-16 20:30:58 +00001367
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001368 /* Single character Unicode objects in the Latin-1 range are
1369 shared when using this constructor */
1370 if (size == 1 && *u < 256)
1371 return get_latin1_char((unsigned char)*u);
1372
1373 /* If not empty and not single character, copy the Unicode data
1374 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001375 if (find_maxchar_surrogates(u, u + size,
1376 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001377 return NULL;
1378
1379 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1380 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001381 if (!unicode)
1382 return NULL;
1383
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001384 switch (PyUnicode_KIND(unicode)) {
1385 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001386 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001387 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1388 break;
1389 case PyUnicode_2BYTE_KIND:
1390#if Py_UNICODE_SIZE == 2
1391 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1392#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001393 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001394 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1395#endif
1396 break;
1397 case PyUnicode_4BYTE_KIND:
1398#if SIZEOF_WCHAR_T == 2
1399 /* This is the only case which has to process surrogates, thus
1400 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001401 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001402#else
1403 assert(num_surrogates == 0);
1404 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1405#endif
1406 break;
1407 default:
1408 assert(0 && "Impossible state");
1409 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001410
1411 return (PyObject *)unicode;
1412}
1413
Alexander Belopolsky40018472011-02-26 01:02:56 +00001414PyObject *
1415PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001416{
1417 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001418
Benjamin Peterson14339b62009-01-31 16:36:08 +00001419 if (size < 0) {
1420 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001421 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001422 return NULL;
1423 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001424
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001425 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001426 some optimizations which share commonly used objects.
1427 Also, this means the input must be UTF-8, so fall back to the
1428 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001429 if (u != NULL) {
1430
Benjamin Peterson29060642009-01-31 22:14:21 +00001431 /* Optimization for empty strings */
1432 if (size == 0 && unicode_empty != NULL) {
1433 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001434 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001435 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001436
1437 /* Single characters are shared when using this constructor.
1438 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001439 if (size == 1 && Py_CHARMASK(*u) < 128)
1440 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001441
1442 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001443 }
1444
Walter Dörwald55507312007-05-18 13:12:10 +00001445 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001446 if (!unicode)
1447 return NULL;
1448
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001449 return (PyObject *)unicode;
1450}
1451
Alexander Belopolsky40018472011-02-26 01:02:56 +00001452PyObject *
1453PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001454{
1455 size_t size = strlen(u);
1456 if (size > PY_SSIZE_T_MAX) {
1457 PyErr_SetString(PyExc_OverflowError, "input too long");
1458 return NULL;
1459 }
1460
1461 return PyUnicode_FromStringAndSize(u, size);
1462}
1463
Victor Stinnere57b1c02011-09-28 22:20:48 +02001464static PyObject*
1465_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001466{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001467 PyObject *res;
1468 unsigned char max = 127;
1469 Py_ssize_t i;
1470 for (i = 0; i < size; i++) {
1471 if (u[i] & 0x80) {
1472 max = 255;
1473 break;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001474 }
1475 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001476 res = PyUnicode_New(size, max);
1477 if (!res)
1478 return NULL;
1479 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1480 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001481}
1482
Victor Stinnere57b1c02011-09-28 22:20:48 +02001483static PyObject*
1484_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001485{
1486 PyObject *res;
1487 Py_UCS2 max = 0;
1488 Py_ssize_t i;
1489 for (i = 0; i < size; i++)
1490 if (u[i] > max)
1491 max = u[i];
1492 res = PyUnicode_New(size, max);
1493 if (!res)
1494 return NULL;
1495 if (max >= 256)
1496 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1497 else
1498 for (i = 0; i < size; i++)
1499 PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i];
1500 return res;
1501}
1502
Victor Stinnere57b1c02011-09-28 22:20:48 +02001503static PyObject*
1504_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001505{
1506 PyObject *res;
1507 Py_UCS4 max = 0;
1508 Py_ssize_t i;
1509 for (i = 0; i < size; i++)
1510 if (u[i] > max)
1511 max = u[i];
1512 res = PyUnicode_New(size, max);
1513 if (!res)
1514 return NULL;
1515 if (max >= 0x10000)
1516 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1517 else {
1518 int kind = PyUnicode_KIND(res);
1519 void *data = PyUnicode_DATA(res);
1520 for (i = 0; i < size; i++)
1521 PyUnicode_WRITE(kind, data, i, u[i]);
1522 }
1523 return res;
1524}
1525
1526PyObject*
1527PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1528{
1529 switch(kind) {
1530 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001531 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001532 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001533 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001534 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001535 return _PyUnicode_FromUCS4(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001536 }
Victor Stinner202b62b2011-10-01 23:48:37 +02001537 PyErr_SetString(PyExc_ValueError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001538 return NULL;
1539}
1540
Victor Stinner034f6cf2011-09-30 02:26:44 +02001541PyObject*
1542PyUnicode_Copy(PyObject *unicode)
1543{
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001544 Py_ssize_t size;
1545 PyObject *copy;
1546 void *data;
1547
Victor Stinner034f6cf2011-09-30 02:26:44 +02001548 if (!PyUnicode_Check(unicode)) {
1549 PyErr_BadInternalCall();
1550 return NULL;
1551 }
1552 if (PyUnicode_READY(unicode))
1553 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001554
1555 size = PyUnicode_GET_LENGTH(unicode);
1556 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1557 if (!copy)
1558 return NULL;
1559 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1560
1561 data = PyUnicode_DATA(unicode);
1562 switch (PyUnicode_KIND(unicode))
1563 {
1564 case PyUnicode_1BYTE_KIND:
1565 memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1566 break;
1567 case PyUnicode_2BYTE_KIND:
1568 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1569 break;
1570 case PyUnicode_4BYTE_KIND:
1571 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1572 break;
1573 default:
1574 assert(0);
1575 break;
1576 }
1577 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001578}
1579
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001580
Victor Stinnerbc603d12011-10-02 01:00:40 +02001581/* Widen Unicode objects to larger buffers. Don't write terminating null
1582 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001583
1584void*
1585_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1586{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001587 Py_ssize_t len;
1588 void *result;
1589 unsigned int skind;
1590
1591 if (PyUnicode_READY(s))
1592 return NULL;
1593
1594 len = PyUnicode_GET_LENGTH(s);
1595 skind = PyUnicode_KIND(s);
1596 if (skind >= kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001597 PyErr_SetString(PyExc_RuntimeError, "invalid widening attempt");
1598 return NULL;
1599 }
1600 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001601 case PyUnicode_2BYTE_KIND:
1602 result = PyMem_Malloc(len * sizeof(Py_UCS2));
1603 if (!result)
1604 return PyErr_NoMemory();
1605 assert(skind == PyUnicode_1BYTE_KIND);
1606 _PyUnicode_CONVERT_BYTES(
1607 Py_UCS1, Py_UCS2,
1608 PyUnicode_1BYTE_DATA(s),
1609 PyUnicode_1BYTE_DATA(s) + len,
1610 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001611 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001612 case PyUnicode_4BYTE_KIND:
1613 result = PyMem_Malloc(len * sizeof(Py_UCS4));
1614 if (!result)
1615 return PyErr_NoMemory();
1616 if (skind == PyUnicode_2BYTE_KIND) {
1617 _PyUnicode_CONVERT_BYTES(
1618 Py_UCS2, Py_UCS4,
1619 PyUnicode_2BYTE_DATA(s),
1620 PyUnicode_2BYTE_DATA(s) + len,
1621 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001622 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001623 else {
1624 assert(skind == PyUnicode_1BYTE_KIND);
1625 _PyUnicode_CONVERT_BYTES(
1626 Py_UCS1, Py_UCS4,
1627 PyUnicode_1BYTE_DATA(s),
1628 PyUnicode_1BYTE_DATA(s) + len,
1629 result);
1630 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001631 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001632 default:
1633 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001634 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001635 PyErr_SetString(PyExc_ValueError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001636 return NULL;
1637}
1638
1639static Py_UCS4*
1640as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1641 int copy_null)
1642{
1643 int kind;
1644 void *data;
1645 Py_ssize_t len, targetlen;
1646 if (PyUnicode_READY(string) == -1)
1647 return NULL;
1648 kind = PyUnicode_KIND(string);
1649 data = PyUnicode_DATA(string);
1650 len = PyUnicode_GET_LENGTH(string);
1651 targetlen = len;
1652 if (copy_null)
1653 targetlen++;
1654 if (!target) {
1655 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1656 PyErr_NoMemory();
1657 return NULL;
1658 }
1659 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1660 if (!target) {
1661 PyErr_NoMemory();
1662 return NULL;
1663 }
1664 }
1665 else {
1666 if (targetsize < targetlen) {
1667 PyErr_Format(PyExc_SystemError,
1668 "string is longer than the buffer");
1669 if (copy_null && 0 < targetsize)
1670 target[0] = 0;
1671 return NULL;
1672 }
1673 }
1674 if (kind != PyUnicode_4BYTE_KIND) {
1675 Py_ssize_t i;
1676 for (i = 0; i < len; i++)
1677 target[i] = PyUnicode_READ(kind, data, i);
1678 }
1679 else
1680 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1681 if (copy_null)
1682 target[len] = 0;
1683 return target;
1684}
1685
1686Py_UCS4*
1687PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1688 int copy_null)
1689{
1690 if (target == NULL || targetsize < 1) {
1691 PyErr_BadInternalCall();
1692 return NULL;
1693 }
1694 return as_ucs4(string, target, targetsize, copy_null);
1695}
1696
1697Py_UCS4*
1698PyUnicode_AsUCS4Copy(PyObject *string)
1699{
1700 return as_ucs4(string, NULL, 0, 1);
1701}
1702
1703#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00001704
Alexander Belopolsky40018472011-02-26 01:02:56 +00001705PyObject *
1706PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001707{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001708 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +00001709 if (size == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001710 return PyUnicode_New(0, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00001711 PyErr_BadInternalCall();
1712 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001713 }
1714
Martin v. Löwis790465f2008-04-05 20:41:37 +00001715 if (size == -1) {
1716 size = wcslen(w);
1717 }
1718
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001719 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001720}
1721
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001722#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00001723
Walter Dörwald346737f2007-05-31 10:44:43 +00001724static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001725makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
1726 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00001727{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001728 *fmt++ = '%';
1729 if (width) {
1730 if (zeropad)
1731 *fmt++ = '0';
1732 fmt += sprintf(fmt, "%d", width);
1733 }
1734 if (precision)
1735 fmt += sprintf(fmt, ".%d", precision);
1736 if (longflag)
1737 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001738 else if (longlongflag) {
1739 /* longlongflag should only ever be nonzero on machines with
1740 HAVE_LONG_LONG defined */
1741#ifdef HAVE_LONG_LONG
1742 char *f = PY_FORMAT_LONG_LONG;
1743 while (*f)
1744 *fmt++ = *f++;
1745#else
1746 /* we shouldn't ever get here */
1747 assert(0);
1748 *fmt++ = 'l';
1749#endif
1750 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001751 else if (size_tflag) {
1752 char *f = PY_FORMAT_SIZE_T;
1753 while (*f)
1754 *fmt++ = *f++;
1755 }
1756 *fmt++ = c;
1757 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00001758}
1759
/* helper for PyUnicode_FromFormatV(): parse one format specification.

   On entry `f` points at the '%'.  The optional width, ".precision" and
   length modifier ("l", "ll" or "z", recognised only when followed by
   'd', 'u' or 'i') are consumed.  Each non-NULL output pointer receives
   the parsed value, or 0 if that part is absent.  The returned pointer
   addresses the character the caller should treat as the conversion. */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* Skip the '%', then parse the width.precision part,
       e.g. "%2.5s" => width=2, precision=5 */
    f++;
    for (; Py_ISDIGIT((unsigned)*f); f++)
        width = (width * 10) + *f - '0';
    if (*f == '.') {
        f++;
        for (; Py_ISDIGIT((unsigned)*f); f++)
            precision = (precision * 10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }

    /* Length modifiers: %ld, %lu, %li, their "ll" forms, and the size_t
       forms %zd, %zu, %zi.  A modifier not followed by one of these
       conversions is left unconsumed. */
    switch (*f) {
    case 'l':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            f += 1;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
        break;
    case 'z':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            size_tflag = 1;
            f += 1;
        }
        break;
    default:
        break;
    }

    /* Report only the values the caller asked for. */
    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;
    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
1824
/* Upper bound on the number of characters produced by %ld: 21 is enough
   for a 64-bit integer printed in decimal plus an optional sign. */
#define MAX_LONG_CHARS 21
/* Upper bound on the number of characters produced by %lld.
   At most ceil(log10(256)*SIZEOF_LONG_LONG) digits are needed, plus 1
   for the sign; 53/22 is used as an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
1832
Walter Dörwaldd2034312007-05-18 16:29:38 +00001833PyObject *
1834PyUnicode_FromFormatV(const char *format, va_list vargs)
1835{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001836 va_list count;
1837 Py_ssize_t callcount = 0;
1838 PyObject **callresults = NULL;
1839 PyObject **callresult = NULL;
1840 Py_ssize_t n = 0;
1841 int width = 0;
1842 int precision = 0;
1843 int zeropad;
1844 const char* f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001845 PyUnicodeObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001846 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001847 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001848 Py_UCS4 maxchar = 127; /* result is ASCII by default */
1849 Py_UCS4 argmaxchar;
1850 Py_ssize_t numbersize = 0;
1851 char *numberresults = NULL;
1852 char *numberresult = NULL;
1853 Py_ssize_t i;
1854 int kind;
1855 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001856
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001857 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001858 /* step 1: count the number of %S/%R/%A/%s format specifications
1859 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
1860 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001861 * result in an array)
1862 * also esimate a upper bound for all the number formats in the string,
1863 * numbers will be formated in step 3 and be keept in a '\0'-separated
1864 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00001865 for (f = format; *f; f++) {
1866 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001867 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001868 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
1869 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
1870 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
1871 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001872
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001873 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001874#ifdef HAVE_LONG_LONG
1875 if (longlongflag) {
1876 if (width < MAX_LONG_LONG_CHARS)
1877 width = MAX_LONG_LONG_CHARS;
1878 }
1879 else
1880#endif
1881 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
1882 including sign. Decimal takes the most space. This
1883 isn't enough for octal. If a width is specified we
1884 need more (which we allocate later). */
1885 if (width < MAX_LONG_CHARS)
1886 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001887
1888 /* account for the size + '\0' to separate numbers
1889 inside of the numberresults buffer */
1890 numbersize += (width + 1);
1891 }
1892 }
1893 else if ((unsigned char)*f > 127) {
1894 PyErr_Format(PyExc_ValueError,
1895 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
1896 "string, got a non-ASCII byte: 0x%02x",
1897 (unsigned char)*f);
1898 return NULL;
1899 }
1900 }
1901 /* step 2: allocate memory for the results of
1902 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
1903 if (callcount) {
1904 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
1905 if (!callresults) {
1906 PyErr_NoMemory();
1907 return NULL;
1908 }
1909 callresult = callresults;
1910 }
1911 /* step 2.5: allocate memory for the results of formating numbers */
1912 if (numbersize) {
1913 numberresults = PyObject_Malloc(numbersize);
1914 if (!numberresults) {
1915 PyErr_NoMemory();
1916 goto fail;
1917 }
1918 numberresult = numberresults;
1919 }
1920
1921 /* step 3: format numbers and figure out how large a buffer we need */
1922 for (f = format; *f; f++) {
1923 if (*f == '%') {
1924 const char* p;
1925 int longflag;
1926 int longlongflag;
1927 int size_tflag;
1928 int numprinted;
1929
1930 p = f;
1931 zeropad = (f[1] == '0');
1932 f = parse_format_flags(f, &width, &precision,
1933 &longflag, &longlongflag, &size_tflag);
1934 switch (*f) {
1935 case 'c':
1936 {
1937 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001938 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001939 n++;
1940 break;
1941 }
1942 case '%':
1943 n++;
1944 break;
1945 case 'i':
1946 case 'd':
1947 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1948 width, precision, *f);
1949 if (longflag)
1950 numprinted = sprintf(numberresult, fmt,
1951 va_arg(count, long));
1952#ifdef HAVE_LONG_LONG
1953 else if (longlongflag)
1954 numprinted = sprintf(numberresult, fmt,
1955 va_arg(count, PY_LONG_LONG));
1956#endif
1957 else if (size_tflag)
1958 numprinted = sprintf(numberresult, fmt,
1959 va_arg(count, Py_ssize_t));
1960 else
1961 numprinted = sprintf(numberresult, fmt,
1962 va_arg(count, int));
1963 n += numprinted;
1964 /* advance by +1 to skip over the '\0' */
1965 numberresult += (numprinted + 1);
1966 assert(*(numberresult - 1) == '\0');
1967 assert(*(numberresult - 2) != '\0');
1968 assert(numprinted >= 0);
1969 assert(numberresult <= numberresults + numbersize);
1970 break;
1971 case 'u':
1972 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1973 width, precision, 'u');
1974 if (longflag)
1975 numprinted = sprintf(numberresult, fmt,
1976 va_arg(count, unsigned long));
1977#ifdef HAVE_LONG_LONG
1978 else if (longlongflag)
1979 numprinted = sprintf(numberresult, fmt,
1980 va_arg(count, unsigned PY_LONG_LONG));
1981#endif
1982 else if (size_tflag)
1983 numprinted = sprintf(numberresult, fmt,
1984 va_arg(count, size_t));
1985 else
1986 numprinted = sprintf(numberresult, fmt,
1987 va_arg(count, unsigned int));
1988 n += numprinted;
1989 numberresult += (numprinted + 1);
1990 assert(*(numberresult - 1) == '\0');
1991 assert(*(numberresult - 2) != '\0');
1992 assert(numprinted >= 0);
1993 assert(numberresult <= numberresults + numbersize);
1994 break;
1995 case 'x':
1996 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
1997 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
1998 n += numprinted;
1999 numberresult += (numprinted + 1);
2000 assert(*(numberresult - 1) == '\0');
2001 assert(*(numberresult - 2) != '\0');
2002 assert(numprinted >= 0);
2003 assert(numberresult <= numberresults + numbersize);
2004 break;
2005 case 'p':
2006 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2007 /* %p is ill-defined: ensure leading 0x. */
2008 if (numberresult[1] == 'X')
2009 numberresult[1] = 'x';
2010 else if (numberresult[1] != 'x') {
2011 memmove(numberresult + 2, numberresult,
2012 strlen(numberresult) + 1);
2013 numberresult[0] = '0';
2014 numberresult[1] = 'x';
2015 numprinted += 2;
2016 }
2017 n += numprinted;
2018 numberresult += (numprinted + 1);
2019 assert(*(numberresult - 1) == '\0');
2020 assert(*(numberresult - 2) != '\0');
2021 assert(numprinted >= 0);
2022 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002023 break;
2024 case 's':
2025 {
2026 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002027 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002028 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
2029 if (!str)
2030 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002031 /* since PyUnicode_DecodeUTF8 returns already flexible
2032 unicode objects, there is no need to call ready on them */
2033 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002034 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002035 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002036 /* Remember the str and switch to the next slot */
2037 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002038 break;
2039 }
2040 case 'U':
2041 {
2042 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002043 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002044 if (PyUnicode_READY(obj) == -1)
2045 goto fail;
2046 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002047 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002048 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002049 break;
2050 }
2051 case 'V':
2052 {
2053 PyObject *obj = va_arg(count, PyObject *);
2054 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002055 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002056 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002057 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002058 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002059 if (PyUnicode_READY(obj) == -1)
2060 goto fail;
2061 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002062 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002063 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002064 *callresult++ = NULL;
2065 }
2066 else {
2067 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2068 if (!str_obj)
2069 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002070 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002071 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002072 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002073 *callresult++ = str_obj;
2074 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002075 break;
2076 }
2077 case 'S':
2078 {
2079 PyObject *obj = va_arg(count, PyObject *);
2080 PyObject *str;
2081 assert(obj);
2082 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002083 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002084 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002085 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002086 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002087 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002088 /* Remember the str and switch to the next slot */
2089 *callresult++ = str;
2090 break;
2091 }
2092 case 'R':
2093 {
2094 PyObject *obj = va_arg(count, PyObject *);
2095 PyObject *repr;
2096 assert(obj);
2097 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002098 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002099 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002100 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002101 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002102 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002103 /* Remember the repr and switch to the next slot */
2104 *callresult++ = repr;
2105 break;
2106 }
2107 case 'A':
2108 {
2109 PyObject *obj = va_arg(count, PyObject *);
2110 PyObject *ascii;
2111 assert(obj);
2112 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002113 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002114 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002115 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002116 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002117 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002118 /* Remember the repr and switch to the next slot */
2119 *callresult++ = ascii;
2120 break;
2121 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002122 default:
2123 /* if we stumble upon an unknown
2124 formatting code, copy the rest of
2125 the format string to the output
2126 string. (we cannot just skip the
2127 code, since there's no way to know
2128 what's in the argument list) */
2129 n += strlen(p);
2130 goto expand;
2131 }
2132 } else
2133 n++;
2134 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002135 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002136 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002137 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002138 we don't have to resize the string.
2139 There can be no errors beyond this point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002140 string = (PyUnicodeObject *)PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002141 if (!string)
2142 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002143 kind = PyUnicode_KIND(string);
2144 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002145 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002146 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002147
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002148 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002149 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002150 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002151
2152 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002153 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2154 /* checking for == because the last argument could be a empty
2155 string, which causes i to point to end, the assert at the end of
2156 the loop */
2157 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002158
Benjamin Peterson14339b62009-01-31 16:36:08 +00002159 switch (*f) {
2160 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002161 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002162 const int ordinal = va_arg(vargs, int);
2163 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002164 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002165 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002166 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002167 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002168 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002169 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002170 case 'p':
2171 /* unused, since we already have the result */
2172 if (*f == 'p')
2173 (void) va_arg(vargs, void *);
2174 else
2175 (void) va_arg(vargs, int);
2176 /* extract the result from numberresults and append. */
2177 for (; *numberresult; ++i, ++numberresult)
2178 PyUnicode_WRITE(kind, data, i, *numberresult);
2179 /* skip over the separating '\0' */
2180 assert(*numberresult == '\0');
2181 numberresult++;
2182 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002183 break;
2184 case 's':
2185 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002186 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002187 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002188 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002189 size = PyUnicode_GET_LENGTH(*callresult);
2190 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002191 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2192 *callresult, 0,
2193 size) < 0)
2194 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002195 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002196 /* We're done with the unicode()/repr() => forget it */
2197 Py_DECREF(*callresult);
2198 /* switch to next unicode()/repr() result */
2199 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002200 break;
2201 }
2202 case 'U':
2203 {
2204 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002205 Py_ssize_t size;
2206 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2207 size = PyUnicode_GET_LENGTH(obj);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002208 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2209 obj, 0,
2210 size) < 0)
2211 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002212 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002213 break;
2214 }
2215 case 'V':
2216 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002217 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002218 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002219 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002220 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002221 size = PyUnicode_GET_LENGTH(obj);
2222 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002223 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2224 obj, 0,
2225 size) < 0)
2226 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002227 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002228 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002229 size = PyUnicode_GET_LENGTH(*callresult);
2230 assert(PyUnicode_KIND(*callresult) <=
2231 PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002232 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2233 *callresult,
2234 0, size) < 0)
2235 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002236 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002237 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002238 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002239 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002240 break;
2241 }
2242 case 'S':
2243 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002244 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002245 {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002246 /* unused, since we already have the result */
2247 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002248 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002249 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2250 *callresult, 0,
2251 PyUnicode_GET_LENGTH(*callresult)) < 0)
2252 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002253 i += PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002254 /* We're done with the unicode()/repr() => forget it */
2255 Py_DECREF(*callresult);
2256 /* switch to next unicode()/repr() result */
2257 ++callresult;
2258 break;
2259 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002260 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002261 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002262 break;
2263 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002264 for (; *p; ++p, ++i)
2265 PyUnicode_WRITE(kind, data, i, *p);
2266 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002267 goto end;
2268 }
Victor Stinner1205f272010-09-11 00:54:47 +00002269 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002270 else {
2271 assert(i < PyUnicode_GET_LENGTH(string));
2272 PyUnicode_WRITE(kind, data, i++, *f);
2273 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002274 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002275 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002276
Benjamin Peterson29060642009-01-31 22:14:21 +00002277 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002278 if (callresults)
2279 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002280 if (numberresults)
2281 PyObject_Free(numberresults);
2282 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00002283 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002284 if (callresults) {
2285 PyObject **callresult2 = callresults;
2286 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002287 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002288 ++callresult2;
2289 }
2290 PyObject_Free(callresults);
2291 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002292 if (numberresults)
2293 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002294 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002295}
2296
Walter Dörwaldd2034312007-05-18 16:29:38 +00002297PyObject *
2298PyUnicode_FromFormat(const char *format, ...)
2299{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002300 PyObject* ret;
2301 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002302
2303#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002304 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002305#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002306 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002307#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002308 ret = PyUnicode_FromFormatV(format, vargs);
2309 va_end(vargs);
2310 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002311}
2312
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002313#ifdef HAVE_WCHAR_H
2314
Victor Stinner5593d8a2010-10-02 11:11:27 +00002315/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2316 convert a Unicode object to a wide character string.
2317
Victor Stinnerd88d9832011-09-06 02:00:05 +02002318 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002319 character) required to convert the unicode object. Ignore size argument.
2320
Victor Stinnerd88d9832011-09-06 02:00:05 +02002321 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002322 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002323 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002324static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00002325unicode_aswidechar(PyUnicodeObject *unicode,
2326 wchar_t *w,
2327 Py_ssize_t size)
2328{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002329 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002330 const wchar_t *wstr;
2331
2332 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
2333 if (wstr == NULL)
2334 return -1;
2335
Victor Stinner5593d8a2010-10-02 11:11:27 +00002336 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002337 if (size > res)
2338 size = res + 1;
2339 else
2340 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002341 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002342 return res;
2343 }
2344 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002345 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002346}
2347
2348Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002349PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002350 wchar_t *w,
2351 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002352{
2353 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002354 PyErr_BadInternalCall();
2355 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002356 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002357 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002358}
2359
Victor Stinner137c34c2010-09-29 10:25:54 +00002360wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002361PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002362 Py_ssize_t *size)
2363{
2364 wchar_t* buffer;
2365 Py_ssize_t buflen;
2366
2367 if (unicode == NULL) {
2368 PyErr_BadInternalCall();
2369 return NULL;
2370 }
2371
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002372 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002373 if (buflen == -1)
2374 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002375 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002376 PyErr_NoMemory();
2377 return NULL;
2378 }
2379
Victor Stinner137c34c2010-09-29 10:25:54 +00002380 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2381 if (buffer == NULL) {
2382 PyErr_NoMemory();
2383 return NULL;
2384 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002385 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002386 if (buflen == -1)
2387 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002388 if (size != NULL)
2389 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002390 return buffer;
2391}
2392
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002393#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002394
Alexander Belopolsky40018472011-02-26 01:02:56 +00002395PyObject *
2396PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002397{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002398 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002399 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002400 PyErr_SetString(PyExc_ValueError,
2401 "chr() arg not in range(0x110000)");
2402 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002403 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002404
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002405 if (ordinal < 256)
2406 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002407
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002408 v = PyUnicode_New(1, ordinal);
2409 if (v == NULL)
2410 return NULL;
2411 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
2412 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002413}
2414
Alexander Belopolsky40018472011-02-26 01:02:56 +00002415PyObject *
2416PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002417{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002418 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002419 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002420 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002421 if (PyUnicode_READY(obj))
2422 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002423 Py_INCREF(obj);
2424 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002425 }
2426 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002427 /* For a Unicode subtype that's not a Unicode object,
2428 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002429 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002430 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002431 PyErr_Format(PyExc_TypeError,
2432 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002433 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002434 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002435}
2436
Alexander Belopolsky40018472011-02-26 01:02:56 +00002437PyObject *
2438PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002439 const char *encoding,
2440 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002441{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002442 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002443 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002444
Guido van Rossumd57fd912000-03-10 22:53:23 +00002445 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002446 PyErr_BadInternalCall();
2447 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002448 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002449
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002450 /* Decoding bytes objects is the most common case and should be fast */
2451 if (PyBytes_Check(obj)) {
2452 if (PyBytes_GET_SIZE(obj) == 0) {
2453 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002454 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002455 }
2456 else {
2457 v = PyUnicode_Decode(
2458 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2459 encoding, errors);
2460 }
2461 return v;
2462 }
2463
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002464 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002465 PyErr_SetString(PyExc_TypeError,
2466 "decoding str is not supported");
2467 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002468 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002469
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002470 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2471 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2472 PyErr_Format(PyExc_TypeError,
2473 "coercing to str: need bytes, bytearray "
2474 "or buffer-like object, %.80s found",
2475 Py_TYPE(obj)->tp_name);
2476 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002477 }
Tim Petersced69f82003-09-16 20:30:58 +00002478
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002479 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002480 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002481 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002482 }
Tim Petersced69f82003-09-16 20:30:58 +00002483 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002484 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002485
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002486 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002487 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002488}
2489
/* Write a normalized copy of 'encoding' into 'lower': upper case letters
   are converted to lower case and '_' becomes '-', so that e.g. UTF_8 is
   recognized. 'lower' can hold 'lower_len' chars including the
   terminating NUL.

   Return 1 on success, 0 if the encoding name is longer than
   lower_len-1 characters (in which case 'lower' is left unterminated). */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst = lower;
    /* Last usable slot: reserved for the terminating NUL. */
    char *const last = &lower[lower_len - 1];

    for (src = encoding; *src != '\0'; src++) {
        char c;

        if (dst == last)
            return 0;

        if (Py_ISUPPER(*src))
            c = Py_TOLOWER(*src);
        else if (*src == '_')
            c = '-';
        else
            c = *src;
        *dst++ = c;
    }
    *dst = '\0';
    return 1;
}
2522
Alexander Belopolsky40018472011-02-26 01:02:56 +00002523PyObject *
2524PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002525 Py_ssize_t size,
2526 const char *encoding,
2527 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002528{
2529 PyObject *buffer = NULL, *unicode;
2530 Py_buffer info;
2531 char lower[11]; /* Enough for any encoding shortcut */
2532
2533 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002534 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00002535
2536 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002537 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002538 if ((strcmp(lower, "utf-8") == 0) ||
2539 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002540 return PyUnicode_DecodeUTF8(s, size, errors);
2541 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002542 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002543 (strcmp(lower, "iso-8859-1") == 0))
2544 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002545#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002546 else if (strcmp(lower, "mbcs") == 0)
2547 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002548#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002549 else if (strcmp(lower, "ascii") == 0)
2550 return PyUnicode_DecodeASCII(s, size, errors);
2551 else if (strcmp(lower, "utf-16") == 0)
2552 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2553 else if (strcmp(lower, "utf-32") == 0)
2554 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2555 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002556
2557 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002558 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002559 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002560 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002561 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002562 if (buffer == NULL)
2563 goto onError;
2564 unicode = PyCodec_Decode(buffer, encoding, errors);
2565 if (unicode == NULL)
2566 goto onError;
2567 if (!PyUnicode_Check(unicode)) {
2568 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002569 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002570 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002571 Py_DECREF(unicode);
2572 goto onError;
2573 }
2574 Py_DECREF(buffer);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02002575 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002576 Py_DECREF(unicode);
2577 return NULL;
2578 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002579 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002580
Benjamin Peterson29060642009-01-31 22:14:21 +00002581 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002582 Py_XDECREF(buffer);
2583 return NULL;
2584}
2585
Alexander Belopolsky40018472011-02-26 01:02:56 +00002586PyObject *
2587PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002588 const char *encoding,
2589 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002590{
2591 PyObject *v;
2592
2593 if (!PyUnicode_Check(unicode)) {
2594 PyErr_BadArgument();
2595 goto onError;
2596 }
2597
2598 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002599 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002600
2601 /* Decode via the codec registry */
2602 v = PyCodec_Decode(unicode, encoding, errors);
2603 if (v == NULL)
2604 goto onError;
2605 return v;
2606
Benjamin Peterson29060642009-01-31 22:14:21 +00002607 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002608 return NULL;
2609}
2610
Alexander Belopolsky40018472011-02-26 01:02:56 +00002611PyObject *
2612PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002613 const char *encoding,
2614 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002615{
2616 PyObject *v;
2617
2618 if (!PyUnicode_Check(unicode)) {
2619 PyErr_BadArgument();
2620 goto onError;
2621 }
2622
2623 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002624 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002625
2626 /* Decode via the codec registry */
2627 v = PyCodec_Decode(unicode, encoding, errors);
2628 if (v == NULL)
2629 goto onError;
2630 if (!PyUnicode_Check(v)) {
2631 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002632 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002633 Py_TYPE(v)->tp_name);
2634 Py_DECREF(v);
2635 goto onError;
2636 }
2637 return v;
2638
Benjamin Peterson29060642009-01-31 22:14:21 +00002639 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002640 return NULL;
2641}
2642
Alexander Belopolsky40018472011-02-26 01:02:56 +00002643PyObject *
2644PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002645 Py_ssize_t size,
2646 const char *encoding,
2647 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002648{
2649 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002650
Guido van Rossumd57fd912000-03-10 22:53:23 +00002651 unicode = PyUnicode_FromUnicode(s, size);
2652 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002653 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002654 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2655 Py_DECREF(unicode);
2656 return v;
2657}
2658
Alexander Belopolsky40018472011-02-26 01:02:56 +00002659PyObject *
2660PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002661 const char *encoding,
2662 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002663{
2664 PyObject *v;
2665
2666 if (!PyUnicode_Check(unicode)) {
2667 PyErr_BadArgument();
2668 goto onError;
2669 }
2670
2671 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002672 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002673
2674 /* Encode via the codec registry */
2675 v = PyCodec_Encode(unicode, encoding, errors);
2676 if (v == NULL)
2677 goto onError;
2678 return v;
2679
Benjamin Peterson29060642009-01-31 22:14:21 +00002680 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002681 return NULL;
2682}
2683
Victor Stinnerad158722010-10-27 00:25:46 +00002684PyObject *
2685PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00002686{
Victor Stinner99b95382011-07-04 14:23:54 +02002687#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002688 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2689 PyUnicode_GET_SIZE(unicode),
2690 NULL);
2691#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002692 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00002693#else
Victor Stinner793b5312011-04-27 00:24:21 +02002694 PyInterpreterState *interp = PyThreadState_GET()->interp;
2695 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2696 cannot use it to encode and decode filenames before it is loaded. Load
2697 the Python codec requires to encode at least its own filename. Use the C
2698 version of the locale codec until the codec registry is initialized and
2699 the Python codec is loaded.
2700
2701 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2702 cannot only rely on it: check also interp->fscodec_initialized for
2703 subinterpreters. */
2704 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00002705 return PyUnicode_AsEncodedString(unicode,
2706 Py_FileSystemDefaultEncoding,
2707 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00002708 }
2709 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002710 /* locale encoding with surrogateescape */
2711 wchar_t *wchar;
2712 char *bytes;
2713 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00002714 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002715
2716 wchar = PyUnicode_AsWideCharString(unicode, NULL);
2717 if (wchar == NULL)
2718 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002719 bytes = _Py_wchar2char(wchar, &error_pos);
2720 if (bytes == NULL) {
2721 if (error_pos != (size_t)-1) {
2722 char *errmsg = strerror(errno);
2723 PyObject *exc = NULL;
2724 if (errmsg == NULL)
2725 errmsg = "Py_wchar2char() failed";
2726 raise_encode_exception(&exc,
2727 "filesystemencoding",
2728 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
2729 error_pos, error_pos+1,
2730 errmsg);
2731 Py_XDECREF(exc);
2732 }
2733 else
2734 PyErr_NoMemory();
2735 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002736 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002737 }
2738 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002739
2740 bytes_obj = PyBytes_FromString(bytes);
2741 PyMem_Free(bytes);
2742 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00002743 }
Victor Stinnerad158722010-10-27 00:25:46 +00002744#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00002745}
2746
Alexander Belopolsky40018472011-02-26 01:02:56 +00002747PyObject *
2748PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002749 const char *encoding,
2750 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002751{
2752 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00002753 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00002754
Guido van Rossumd57fd912000-03-10 22:53:23 +00002755 if (!PyUnicode_Check(unicode)) {
2756 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002757 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002758 }
Fred Drakee4315f52000-05-09 19:53:39 +00002759
Victor Stinner2f283c22011-03-02 01:21:46 +00002760 if (encoding == NULL) {
2761 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002762 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002763 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002764 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner2f283c22011-03-02 01:21:46 +00002765 }
Fred Drakee4315f52000-05-09 19:53:39 +00002766
2767 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002768 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002769 if ((strcmp(lower, "utf-8") == 0) ||
2770 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00002771 {
Victor Stinner2f283c22011-03-02 01:21:46 +00002772 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002773 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002774 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002775 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00002776 }
Victor Stinner37296e82010-06-10 13:36:23 +00002777 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002778 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002779 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002780 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002781#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002782 else if (strcmp(lower, "mbcs") == 0)
2783 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2784 PyUnicode_GET_SIZE(unicode),
2785 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002786#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002787 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002788 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00002789 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002790
2791 /* Encode via the codec registry */
2792 v = PyCodec_Encode(unicode, encoding, errors);
2793 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002794 return NULL;
2795
2796 /* The normal path */
2797 if (PyBytes_Check(v))
2798 return v;
2799
2800 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002801 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002802 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002803 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002804
2805 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
2806 "encoder %s returned bytearray instead of bytes",
2807 encoding);
2808 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002809 Py_DECREF(v);
2810 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002811 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002812
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002813 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
2814 Py_DECREF(v);
2815 return b;
2816 }
2817
2818 PyErr_Format(PyExc_TypeError,
2819 "encoder did not return a bytes object (type=%.400s)",
2820 Py_TYPE(v)->tp_name);
2821 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002822 return NULL;
2823}
2824
Alexander Belopolsky40018472011-02-26 01:02:56 +00002825PyObject *
2826PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002827 const char *encoding,
2828 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002829{
2830 PyObject *v;
2831
2832 if (!PyUnicode_Check(unicode)) {
2833 PyErr_BadArgument();
2834 goto onError;
2835 }
2836
2837 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002838 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002839
2840 /* Encode via the codec registry */
2841 v = PyCodec_Encode(unicode, encoding, errors);
2842 if (v == NULL)
2843 goto onError;
2844 if (!PyUnicode_Check(v)) {
2845 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002846 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002847 Py_TYPE(v)->tp_name);
2848 Py_DECREF(v);
2849 goto onError;
2850 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002851 return v;
Tim Petersced69f82003-09-16 20:30:58 +00002852
Benjamin Peterson29060642009-01-31 22:14:21 +00002853 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002854 return NULL;
2855}
2856
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002857PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00002858PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002859 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00002860 return PyUnicode_DecodeFSDefaultAndSize(s, size);
2861}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002862
Christian Heimes5894ba72007-11-04 11:43:14 +00002863PyObject*
2864PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
2865{
Victor Stinner99b95382011-07-04 14:23:54 +02002866#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002867 return PyUnicode_DecodeMBCS(s, size, NULL);
2868#elif defined(__APPLE__)
2869 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
2870#else
Victor Stinner793b5312011-04-27 00:24:21 +02002871 PyInterpreterState *interp = PyThreadState_GET()->interp;
2872 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2873 cannot use it to encode and decode filenames before it is loaded. Load
2874 the Python codec requires to encode at least its own filename. Use the C
2875 version of the locale codec until the codec registry is initialized and
2876 the Python codec is loaded.
2877
2878 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2879 cannot only rely on it: check also interp->fscodec_initialized for
2880 subinterpreters. */
2881 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002882 return PyUnicode_Decode(s, size,
2883 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00002884 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002885 }
2886 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002887 /* locale encoding with surrogateescape */
2888 wchar_t *wchar;
2889 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00002890 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002891
2892 if (s[size] != '\0' || size != strlen(s)) {
2893 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2894 return NULL;
2895 }
2896
Victor Stinner168e1172010-10-16 23:16:16 +00002897 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002898 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00002899 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002900
Victor Stinner168e1172010-10-16 23:16:16 +00002901 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002902 PyMem_Free(wchar);
2903 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002904 }
Victor Stinnerad158722010-10-27 00:25:46 +00002905#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002906}
2907
Martin v. Löwis011e8422009-05-05 04:43:17 +00002908
2909int
2910PyUnicode_FSConverter(PyObject* arg, void* addr)
2911{
2912 PyObject *output = NULL;
2913 Py_ssize_t size;
2914 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002915 if (arg == NULL) {
2916 Py_DECREF(*(PyObject**)addr);
2917 return 1;
2918 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00002919 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00002920 output = arg;
2921 Py_INCREF(output);
2922 }
2923 else {
2924 arg = PyUnicode_FromObject(arg);
2925 if (!arg)
2926 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00002927 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002928 Py_DECREF(arg);
2929 if (!output)
2930 return 0;
2931 if (!PyBytes_Check(output)) {
2932 Py_DECREF(output);
2933 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
2934 return 0;
2935 }
2936 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00002937 size = PyBytes_GET_SIZE(output);
2938 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002939 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05002940 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00002941 Py_DECREF(output);
2942 return 0;
2943 }
2944 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002945 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00002946}
2947
2948
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002949int
2950PyUnicode_FSDecoder(PyObject* arg, void* addr)
2951{
2952 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002953 if (arg == NULL) {
2954 Py_DECREF(*(PyObject**)addr);
2955 return 1;
2956 }
2957 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002958 if (PyUnicode_READY(arg))
2959 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002960 output = arg;
2961 Py_INCREF(output);
2962 }
2963 else {
2964 arg = PyBytes_FromObject(arg);
2965 if (!arg)
2966 return 0;
2967 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
2968 PyBytes_GET_SIZE(arg));
2969 Py_DECREF(arg);
2970 if (!output)
2971 return 0;
2972 if (!PyUnicode_Check(output)) {
2973 Py_DECREF(output);
2974 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
2975 return 0;
2976 }
2977 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002978 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
2979 PyUnicode_GET_LENGTH(output), 0, 1)) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002980 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2981 Py_DECREF(output);
2982 return 0;
2983 }
2984 *(PyObject**)addr = output;
2985 return Py_CLEANUP_SUPPORTED;
2986}
2987
2988
Martin v. Löwis5b222132007-06-10 09:51:05 +00002989char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002990PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002991{
Christian Heimesf3863112007-11-22 07:46:41 +00002992 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002993 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
2994
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00002995 if (!PyUnicode_Check(unicode)) {
2996 PyErr_BadArgument();
2997 return NULL;
2998 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002999 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003000 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003001
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003002 if (PyUnicode_UTF8(unicode) == NULL) {
3003 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003004 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3005 if (bytes == NULL)
3006 return NULL;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003007 _PyUnicode_UTF8(u) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3008 if (_PyUnicode_UTF8(u) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003009 Py_DECREF(bytes);
3010 return NULL;
3011 }
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003012 _PyUnicode_UTF8_LENGTH(u) = PyBytes_GET_SIZE(bytes);
3013 Py_MEMCPY(_PyUnicode_UTF8(u), PyBytes_AS_STRING(bytes), _PyUnicode_UTF8_LENGTH(u) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003014 Py_DECREF(bytes);
3015 }
3016
3017 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003018 *psize = PyUnicode_UTF8_LENGTH(unicode);
3019 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003020}
3021
3022char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003023PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003024{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003025 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3026}
3027
3028#ifdef Py_DEBUG
3029int unicode_as_unicode_calls = 0;
3030#endif
3031
3032
3033Py_UNICODE *
3034PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3035{
3036 PyUnicodeObject *u;
3037 const unsigned char *one_byte;
3038#if SIZEOF_WCHAR_T == 4
3039 const Py_UCS2 *two_bytes;
3040#else
3041 const Py_UCS4 *four_bytes;
3042 const Py_UCS4 *ucs4_end;
3043 Py_ssize_t num_surrogates;
3044#endif
3045 wchar_t *w;
3046 wchar_t *wchar_end;
3047
3048 if (!PyUnicode_Check(unicode)) {
3049 PyErr_BadArgument();
3050 return NULL;
3051 }
3052 u = (PyUnicodeObject*)unicode;
3053 if (_PyUnicode_WSTR(u) == NULL) {
3054 /* Non-ASCII compact unicode object */
3055 assert(_PyUnicode_KIND(u) != 0);
3056 assert(PyUnicode_IS_READY(u));
3057
3058#ifdef Py_DEBUG
3059 ++unicode_as_unicode_calls;
3060#endif
3061
3062 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
3063#if SIZEOF_WCHAR_T == 2
3064 four_bytes = PyUnicode_4BYTE_DATA(u);
3065 ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
3066 num_surrogates = 0;
3067
3068 for (; four_bytes < ucs4_end; ++four_bytes) {
3069 if (*four_bytes > 0xFFFF)
3070 ++num_surrogates;
3071 }
3072
3073 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
3074 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
3075 if (!_PyUnicode_WSTR(u)) {
3076 PyErr_NoMemory();
3077 return NULL;
3078 }
3079 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;
3080
3081 w = _PyUnicode_WSTR(u);
3082 wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
3083 four_bytes = PyUnicode_4BYTE_DATA(u);
3084 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3085 if (*four_bytes > 0xFFFF) {
3086 /* encode surrogate pair in this case */
3087 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
3088 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
3089 }
3090 else
3091 *w = *four_bytes;
3092
3093 if (w > wchar_end) {
3094 assert(0 && "Miscalculated string end");
3095 }
3096 }
3097 *w = 0;
3098#else
3099 /* sizeof(wchar_t) == 4 */
3100 Py_FatalError("Impossible unicode object state, wstr and str "
3101 "should share memory already.");
3102 return NULL;
3103#endif
3104 }
3105 else {
3106 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3107 (_PyUnicode_LENGTH(u) + 1));
3108 if (!_PyUnicode_WSTR(u)) {
3109 PyErr_NoMemory();
3110 return NULL;
3111 }
3112 if (!PyUnicode_IS_COMPACT_ASCII(u))
3113 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
3114 w = _PyUnicode_WSTR(u);
3115 wchar_end = w + _PyUnicode_LENGTH(u);
3116
3117 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
3118 one_byte = PyUnicode_1BYTE_DATA(u);
3119 for (; w < wchar_end; ++one_byte, ++w)
3120 *w = *one_byte;
3121 /* null-terminate the wstr */
3122 *w = 0;
3123 }
3124 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
3125#if SIZEOF_WCHAR_T == 4
3126 two_bytes = PyUnicode_2BYTE_DATA(u);
3127 for (; w < wchar_end; ++two_bytes, ++w)
3128 *w = *two_bytes;
3129 /* null-terminate the wstr */
3130 *w = 0;
3131#else
3132 /* sizeof(wchar_t) == 2 */
3133 PyObject_FREE(_PyUnicode_WSTR(u));
3134 _PyUnicode_WSTR(u) = NULL;
3135 Py_FatalError("Impossible unicode object state, wstr "
3136 "and str should share memory already.");
3137 return NULL;
3138#endif
3139 }
3140 else {
3141 assert(0 && "This should never happen.");
3142 }
3143 }
3144 }
3145 if (size != NULL)
3146 *size = PyUnicode_WSTR_LENGTH(u);
3147 return _PyUnicode_WSTR(u);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003148}
3149
Alexander Belopolsky40018472011-02-26 01:02:56 +00003150Py_UNICODE *
3151PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003152{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003153 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003154}
3155
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003156
Alexander Belopolsky40018472011-02-26 01:02:56 +00003157Py_ssize_t
3158PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003159{
3160 if (!PyUnicode_Check(unicode)) {
3161 PyErr_BadArgument();
3162 goto onError;
3163 }
3164 return PyUnicode_GET_SIZE(unicode);
3165
Benjamin Peterson29060642009-01-31 22:14:21 +00003166 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003167 return -1;
3168}
3169
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003170Py_ssize_t
3171PyUnicode_GetLength(PyObject *unicode)
3172{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003173 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003174 PyErr_BadArgument();
3175 return -1;
3176 }
3177
3178 return PyUnicode_GET_LENGTH(unicode);
3179}
3180
3181Py_UCS4
3182PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3183{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003184 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3185 PyErr_BadArgument();
3186 return (Py_UCS4)-1;
3187 }
3188 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3189 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003190 return (Py_UCS4)-1;
3191 }
3192 return PyUnicode_READ_CHAR(unicode, index);
3193}
3194
3195int
3196PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3197{
3198 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003199 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003200 return -1;
3201 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02003202 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3203 PyErr_SetString(PyExc_IndexError, "string index out of range");
3204 return -1;
3205 }
3206 if (_PyUnicode_Dirty(unicode))
3207 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003208 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3209 index, ch);
3210 return 0;
3211}
3212
/* Return the name of the default encoding, which is always "utf-8".  The
   returned string has static storage and must not be modified or freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3218
Victor Stinner554f3f02010-06-16 23:33:54 +00003219/* create or adjust a UnicodeDecodeError */
3220static void
3221make_decode_exception(PyObject **exceptionObject,
3222 const char *encoding,
3223 const char *input, Py_ssize_t length,
3224 Py_ssize_t startpos, Py_ssize_t endpos,
3225 const char *reason)
3226{
3227 if (*exceptionObject == NULL) {
3228 *exceptionObject = PyUnicodeDecodeError_Create(
3229 encoding, input, length, startpos, endpos, reason);
3230 }
3231 else {
3232 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3233 goto onError;
3234 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3235 goto onError;
3236 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3237 goto onError;
3238 }
3239 return;
3240
3241onError:
3242 Py_DECREF(*exceptionObject);
3243 *exceptionObject = NULL;
3244}
3245
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003246/* error handling callback helper:
3247 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003248 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003249 and adjust various state variables.
3250 return 0 on success, -1 on error
3251*/
3252
Alexander Belopolsky40018472011-02-26 01:02:56 +00003253static int
3254unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003255 const char *encoding, const char *reason,
3256 const char **input, const char **inend, Py_ssize_t *startinpos,
3257 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
3258 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003259{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003260 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003261
3262 PyObject *restuple = NULL;
3263 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003264 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003265 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003266 Py_ssize_t requiredsize;
3267 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003268 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003269 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003270 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003271 int res = -1;
3272
3273 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003274 *errorHandler = PyCodec_LookupError(errors);
3275 if (*errorHandler == NULL)
3276 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003277 }
3278
Victor Stinner554f3f02010-06-16 23:33:54 +00003279 make_decode_exception(exceptionObject,
3280 encoding,
3281 *input, *inend - *input,
3282 *startinpos, *endinpos,
3283 reason);
3284 if (*exceptionObject == NULL)
3285 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003286
3287 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3288 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003289 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003290 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003291 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003292 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003293 }
3294 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003295 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003296
3297 /* Copy back the bytes variables, which might have been modified by the
3298 callback */
3299 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3300 if (!inputobj)
3301 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003302 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003303 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003304 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003305 *input = PyBytes_AS_STRING(inputobj);
3306 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003307 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003308 /* we can DECREF safely, as the exception has another reference,
3309 so the object won't go away. */
3310 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003311
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003312 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003313 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003314 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003315 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3316 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003317 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003318
3319 /* need more space? (at least enough for what we
3320 have+the replacement+the rest of the string (starting
3321 at the new input position), so we won't have to check space
3322 when there are no errors in the rest of the string) */
3323 repptr = PyUnicode_AS_UNICODE(repunicode);
3324 repsize = PyUnicode_GET_SIZE(repunicode);
3325 requiredsize = *outpos + repsize + insize-newpos;
3326 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003327 if (requiredsize<2*outsize)
3328 requiredsize = 2*outsize;
Victor Stinnerfe226c02011-10-03 03:52:20 +02003329 if (PyUnicode_Resize((PyObject**)output, requiredsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003330 goto onError;
3331 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003332 }
3333 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003334 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003335 Py_UNICODE_COPY(*outptr, repptr, repsize);
3336 *outptr += repsize;
3337 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003338
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003339 /* we made it! */
3340 res = 0;
3341
Benjamin Peterson29060642009-01-31 22:14:21 +00003342 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003343 Py_XDECREF(restuple);
3344 return res;
3345}
3346
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003347/* --- UTF-7 Codec -------------------------------------------------------- */
3348
Antoine Pitrou244651a2009-05-04 18:56:13 +00003349/* See RFC2152 for details. We encode conservatively and decode liberally. */
3350
/* Three simple macros defining base-64, plus DECODE_DIRECT.  All of them
   may evaluate their argument more than once, so pass plain variables. */

/* Non-zero if c is one of the 64 base-64 alphabet characters. */

#define IS_BASE64(c) \
    ((c) == '+' || (c) == '/' ||        \
     ('0' <= (c) && (c) <= '9') ||      \
     ('a' <= (c) && (c) <= 'z') ||      \
     ('A' <= (c) && (c) <= 'Z'))

/* Base-64 value (0..63) of c; only meaningful when IS_BASE64(c) holds.
   'A'-'Z' -> 0..25, 'a'-'z' -> 26..51, '0'-'9' -> 52..61, '+' -> 62,
   anything else (i.e. '/') -> 63. */

#define FROM_BASE64(c) \
    (('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :           \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :      \
     (c) == '+' ? 62 : 63)

/* Base-64 character for the low 6 bits of n; higher bits are ignored. */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 63])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) < 128)
3381
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by ASCII code and is never written to, hence const.
 */

static const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        -- character code; evaluated several times, pass a variable.
 * directO  -- non-zero if Set O characters are written directly.
 * directWS -- non-zero if whitespace is written directly.
 *
 * The flag arguments are parenthesized so that compound expressions
 * (e.g. `a || b`) bind as a unit.  NUL and codes >= 128 are never direct. */

#define ENCODE_DIRECT(c, directO, directWS)                     \
    ((c) < 128 && (c) > 0 &&                                    \
     ((utf7_category[(c)] == 0) ||                              \
      ((directWS) && (utf7_category[(c)] == 2)) ||              \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003427
Alexander Belopolsky40018472011-02-26 01:02:56 +00003428PyObject *
3429PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003430 Py_ssize_t size,
3431 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003432{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003433 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3434}
3435
Antoine Pitrou244651a2009-05-04 18:56:13 +00003436/* The decoder. The only state we preserve is our read position,
3437 * i.e. how many characters we have consumed. So if we end in the
3438 * middle of a shift sequence we have to back off the read position
3439 * and the output to the beginning of the sequence, otherwise we lose
3440 * all the shift state (seen bits, number of bits seen, high
3441 * surrogate). */
3442
Alexander Belopolsky40018472011-02-26 01:02:56 +00003443PyObject *
3444PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003445 Py_ssize_t size,
3446 const char *errors,
3447 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003448{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003449 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003450 Py_ssize_t startinpos;
3451 Py_ssize_t endinpos;
3452 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003453 const char *e;
3454 PyUnicodeObject *unicode;
3455 Py_UNICODE *p;
3456 const char *errmsg = "";
3457 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003458 Py_UNICODE *shiftOutStart;
3459 unsigned int base64bits = 0;
3460 unsigned long base64buffer = 0;
3461 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003462 PyObject *errorHandler = NULL;
3463 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003464
3465 unicode = _PyUnicode_New(size);
3466 if (!unicode)
3467 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003468 if (size == 0) {
3469 if (consumed)
3470 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003471 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003472 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003473
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003474 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003475 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003476 e = s + size;
3477
3478 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003479 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003480 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003481 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003482
Antoine Pitrou244651a2009-05-04 18:56:13 +00003483 if (inShift) { /* in a base-64 section */
3484 if (IS_BASE64(ch)) { /* consume a base-64 character */
3485 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3486 base64bits += 6;
3487 s++;
3488 if (base64bits >= 16) {
3489 /* we have enough bits for a UTF-16 value */
3490 Py_UNICODE outCh = (Py_UNICODE)
3491 (base64buffer >> (base64bits-16));
3492 base64bits -= 16;
3493 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3494 if (surrogate) {
3495 /* expecting a second surrogate */
3496 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3497#ifdef Py_UNICODE_WIDE
3498 *p++ = (((surrogate & 0x3FF)<<10)
3499 | (outCh & 0x3FF)) + 0x10000;
3500#else
3501 *p++ = surrogate;
3502 *p++ = outCh;
3503#endif
3504 surrogate = 0;
3505 }
3506 else {
3507 surrogate = 0;
3508 errmsg = "second surrogate missing";
3509 goto utf7Error;
3510 }
3511 }
3512 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3513 /* first surrogate */
3514 surrogate = outCh;
3515 }
3516 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3517 errmsg = "unexpected second surrogate";
3518 goto utf7Error;
3519 }
3520 else {
3521 *p++ = outCh;
3522 }
3523 }
3524 }
3525 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003526 inShift = 0;
3527 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003528 if (surrogate) {
3529 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003530 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003531 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003532 if (base64bits > 0) { /* left-over bits */
3533 if (base64bits >= 6) {
3534 /* We've seen at least one base-64 character */
3535 errmsg = "partial character in shift sequence";
3536 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003537 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003538 else {
3539 /* Some bits remain; they should be zero */
3540 if (base64buffer != 0) {
3541 errmsg = "non-zero padding bits in shift sequence";
3542 goto utf7Error;
3543 }
3544 }
3545 }
3546 if (ch != '-') {
3547 /* '-' is absorbed; other terminating
3548 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003549 *p++ = ch;
3550 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003551 }
3552 }
3553 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003554 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003555 s++; /* consume '+' */
3556 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003557 s++;
3558 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003559 }
3560 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003561 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003562 shiftOutStart = p;
3563 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003564 }
3565 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003566 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003567 *p++ = ch;
3568 s++;
3569 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003570 else {
3571 startinpos = s-starts;
3572 s++;
3573 errmsg = "unexpected special character";
3574 goto utf7Error;
3575 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003576 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003577utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003578 outpos = p-PyUnicode_AS_UNICODE(unicode);
3579 endinpos = s-starts;
3580 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003581 errors, &errorHandler,
3582 "utf7", errmsg,
3583 &starts, &e, &startinpos, &endinpos, &exc, &s,
3584 &unicode, &outpos, &p))
3585 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003586 }
3587
Antoine Pitrou244651a2009-05-04 18:56:13 +00003588 /* end of string */
3589
3590 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3591 /* if we're in an inconsistent state, that's an error */
3592 if (surrogate ||
3593 (base64bits >= 6) ||
3594 (base64bits > 0 && base64buffer != 0)) {
3595 outpos = p-PyUnicode_AS_UNICODE(unicode);
3596 endinpos = size;
3597 if (unicode_decode_call_errorhandler(
3598 errors, &errorHandler,
3599 "utf7", "unterminated shift sequence",
3600 &starts, &e, &startinpos, &endinpos, &exc, &s,
3601 &unicode, &outpos, &p))
3602 goto onError;
3603 if (s < e)
3604 goto restart;
3605 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003606 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003607
3608 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003609 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003610 if (inShift) {
3611 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003612 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003613 }
3614 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003615 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003616 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003617 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003618
Victor Stinnerfe226c02011-10-03 03:52:20 +02003619 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003620 goto onError;
3621
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003622 Py_XDECREF(errorHandler);
3623 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02003624 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003625 Py_DECREF(unicode);
3626 return NULL;
3627 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003628 return (PyObject *)unicode;
3629
Benjamin Peterson29060642009-01-31 22:14:21 +00003630 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003631 Py_XDECREF(errorHandler);
3632 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003633 Py_DECREF(unicode);
3634 return NULL;
3635}
3636
3637
Alexander Belopolsky40018472011-02-26 01:02:56 +00003638PyObject *
3639PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003640 Py_ssize_t size,
3641 int base64SetO,
3642 int base64WhiteSpace,
3643 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003644{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003645 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003646 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003647 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003648 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003649 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003650 unsigned int base64bits = 0;
3651 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003652 char * out;
3653 char * start;
3654
3655 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003656 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003657
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003658 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003659 return PyErr_NoMemory();
3660
Antoine Pitrou244651a2009-05-04 18:56:13 +00003661 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003662 if (v == NULL)
3663 return NULL;
3664
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003665 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003666 for (;i < size; ++i) {
3667 Py_UNICODE ch = s[i];
3668
Antoine Pitrou244651a2009-05-04 18:56:13 +00003669 if (inShift) {
3670 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3671 /* shifting out */
3672 if (base64bits) { /* output remaining bits */
3673 *out++ = TO_BASE64(base64buffer << (6-base64bits));
3674 base64buffer = 0;
3675 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003676 }
3677 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003678 /* Characters not in the BASE64 set implicitly unshift the sequence
3679 so no '-' is required, except if the character is itself a '-' */
3680 if (IS_BASE64(ch) || ch == '-') {
3681 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003682 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003683 *out++ = (char) ch;
3684 }
3685 else {
3686 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00003687 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003688 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003689 else { /* not in a shift sequence */
3690 if (ch == '+') {
3691 *out++ = '+';
3692 *out++ = '-';
3693 }
3694 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3695 *out++ = (char) ch;
3696 }
3697 else {
3698 *out++ = '+';
3699 inShift = 1;
3700 goto encode_char;
3701 }
3702 }
3703 continue;
3704encode_char:
3705#ifdef Py_UNICODE_WIDE
3706 if (ch >= 0x10000) {
3707 /* code first surrogate */
3708 base64bits += 16;
3709 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
3710 while (base64bits >= 6) {
3711 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3712 base64bits -= 6;
3713 }
3714 /* prepare second surrogate */
3715 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
3716 }
3717#endif
3718 base64bits += 16;
3719 base64buffer = (base64buffer << 16) | ch;
3720 while (base64bits >= 6) {
3721 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3722 base64bits -= 6;
3723 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00003724 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003725 if (base64bits)
3726 *out++= TO_BASE64(base64buffer << (6-base64bits) );
3727 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003728 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003729 if (_PyBytes_Resize(&v, out - start) < 0)
3730 return NULL;
3731 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003732}
3733
Antoine Pitrou244651a2009-05-04 18:56:13 +00003734#undef IS_BASE64
3735#undef FROM_BASE64
3736#undef TO_BASE64
3737#undef DECODE_DIRECT
3738#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003739
Guido van Rossumd57fd912000-03-10 22:53:23 +00003740/* --- UTF-8 Codec -------------------------------------------------------- */
3741
/* Map a UTF-8 lead (prefix) byte to the total length in bytes of the
   sequence it introduces.  Zero means the byte is not a legal prefix:
   either a continuation byte (80-BF), a lead that could only start an
   overlong encoding (C0-C1), or a lead that would encode a value above
   U+10FFFF (F5-FF).  See RFC 3629 for details.

   The table is only ever read, so it is declared const. */
static const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
3763
Alexander Belopolsky40018472011-02-26 01:02:56 +00003764PyObject *
3765PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003766 Py_ssize_t size,
3767 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003768{
Walter Dörwald69652032004-09-07 20:24:22 +00003769 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3770}
3771
Antoine Pitrouab868312009-01-10 15:40:25 +00003772/* Mask to check or force alignment of a pointer to C 'long' boundaries */
3773#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
3774
3775/* Mask to quickly check whether a C 'long' contains a
3776 non-ASCII, UTF8-encoded char. */
3777#if (SIZEOF_LONG == 8)
3778# define ASCII_CHAR_MASK 0x8080808080808080L
3779#elif (SIZEOF_LONG == 4)
3780# define ASCII_CHAR_MASK 0x80808080L
3781#else
3782# error C 'long' size should be either 4 or 8!
3783#endif
3784
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003785/* Scans a UTF-8 string and returns the maximum character to be expected,
3786 the size of the decoded unicode string and if any major errors were
3787 encountered.
3788
3789 This function does check basic UTF-8 sanity, it does however NOT CHECK
3790 if the string contains surrogates, and if all continuation bytes are
3791 within the correct ranges, these checks are performed in
3792 PyUnicode_DecodeUTF8Stateful.
3793
3794 If it sets has_errors to 1, it means the value of unicode_size and max_char
3795 will be bogus and you should not rely on useful information in them.
3796 */
3797static Py_UCS4
3798utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
3799 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
3800 int *has_errors)
3801{
3802 Py_ssize_t n;
3803 Py_ssize_t char_count = 0;
3804 Py_UCS4 max_char = 127, new_max;
3805 Py_UCS4 upper_bound;
3806 const unsigned char *p = (const unsigned char *)s;
3807 const unsigned char *end = p + string_size;
3808 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
3809 int err = 0;
3810
3811 for (; p < end && !err; ++p, ++char_count) {
3812 /* Only check value if it's not a ASCII char... */
3813 if (*p < 0x80) {
3814 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
3815 an explanation. */
3816 if (!((size_t) p & LONG_PTR_MASK)) {
3817 /* Help register allocation */
3818 register const unsigned char *_p = p;
3819 while (_p < aligned_end) {
3820 unsigned long value = *(unsigned long *) _p;
3821 if (value & ASCII_CHAR_MASK)
3822 break;
3823 _p += SIZEOF_LONG;
3824 char_count += SIZEOF_LONG;
3825 }
3826 p = _p;
3827 if (p == end)
3828 break;
3829 }
3830 }
3831 if (*p >= 0x80) {
3832 n = utf8_code_length[*p];
3833 new_max = max_char;
3834 switch (n) {
3835 /* invalid start byte */
3836 case 0:
3837 err = 1;
3838 break;
3839 case 2:
3840 /* Code points between 0x00FF and 0x07FF inclusive.
3841 Approximate the upper bound of the code point,
3842 if this flips over 255 we can be sure it will be more
3843 than 255 and the string will need 2 bytes per code coint,
3844 if it stays under or equal to 255, we can be sure 1 byte
3845 is enough.
3846 ((*p & 0b00011111) << 6) | 0b00111111 */
3847 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
3848 if (max_char < upper_bound)
3849 new_max = upper_bound;
3850 /* Ensure we track at least that we left ASCII space. */
3851 if (new_max < 128)
3852 new_max = 128;
3853 break;
3854 case 3:
3855 /* Between 0x0FFF and 0xFFFF inclusive, so values are
3856 always > 255 and <= 65535 and will always need 2 bytes. */
3857 if (max_char < 65535)
3858 new_max = 65535;
3859 break;
3860 case 4:
3861 /* Code point will be above 0xFFFF for sure in this case. */
3862 new_max = 65537;
3863 break;
3864 /* Internal error, this should be caught by the first if */
3865 case 1:
3866 default:
3867 assert(0 && "Impossible case in utf8_max_char_and_size");
3868 err = 1;
3869 }
3870 /* Instead of number of overall bytes for this code point,
3871 n containts the number of following bytes: */
3872 --n;
3873 /* Check if the follow up chars are all valid continuation bytes */
3874 if (n >= 1) {
3875 const unsigned char *cont;
3876 if ((p + n) >= end) {
3877 if (consumed == 0)
3878 /* incomplete data, non-incremental decoding */
3879 err = 1;
3880 break;
3881 }
3882 for (cont = p + 1; cont < (p + n); ++cont) {
3883 if ((*cont & 0xc0) != 0x80) {
3884 err = 1;
3885 break;
3886 }
3887 }
3888 p += n;
3889 }
3890 else
3891 err = 1;
3892 max_char = new_max;
3893 }
3894 }
3895
3896 if (unicode_size)
3897 *unicode_size = char_count;
3898 if (has_errors)
3899 *has_errors = err;
3900 return max_char;
3901}
3902
/* Store `value` at position `index` of the character buffer `buf`, using
   the element type selected by `kind`.  Similar to PyUnicode_WRITE, but it
   additionally accepts PyUnicode_WCHAR_KIND, in which case `buf` is treated
   as the Py_UNICODE (wstr) buffer of the legacy representation.  Any kind
   other than the WCHAR, 1BYTE and 2BYTE kinds is written as Py_UCS4.
   `kind` is evaluated once; `index` and `value` are evaluated once, in the
   selected branch only. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value)                     \
    do {                                                                    \
        switch ((int)(kind)) {                                              \
        case PyUnicode_WCHAR_KIND:                                          \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);           \
            break;                                                          \
        case PyUnicode_1BYTE_KIND:                                          \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value);     \
            break;                                                          \
        case PyUnicode_2BYTE_KIND:                                          \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value);                 \
            break;                                                          \
        default:                                                            \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value);                 \
            break;                                                          \
        }                                                                   \
    } while (0)
3917
Alexander Belopolsky40018472011-02-26 01:02:56 +00003918PyObject *
3919PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003920 Py_ssize_t size,
3921 const char *errors,
3922 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003923{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003924 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003925 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00003926 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003927 Py_ssize_t startinpos;
3928 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00003929 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003930 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003931 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003932 PyObject *errorHandler = NULL;
3933 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003934 Py_UCS4 maxchar = 0;
3935 Py_ssize_t unicode_size;
3936 Py_ssize_t i;
3937 int kind;
3938 void *data;
3939 int has_errors;
3940 Py_UNICODE *error_outptr;
3941#if SIZEOF_WCHAR_T == 2
3942 Py_ssize_t wchar_offset = 0;
3943#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003944
Walter Dörwald69652032004-09-07 20:24:22 +00003945 if (size == 0) {
3946 if (consumed)
3947 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003948 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00003949 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003950 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
3951 consumed, &has_errors);
3952 if (has_errors) {
3953 unicode = _PyUnicode_New(size);
3954 if (!unicode)
3955 return NULL;
3956 kind = PyUnicode_WCHAR_KIND;
3957 data = PyUnicode_AS_UNICODE(unicode);
3958 assert(data != NULL);
3959 }
3960 else {
3961 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
3962 if (!unicode)
3963 return NULL;
3964 /* When the string is ASCII only, just use memcpy and return.
3965 unicode_size may be != size if there is an incomplete UTF-8
3966 sequence at the end of the ASCII block. */
3967 if (maxchar < 128 && size == unicode_size) {
3968 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
3969 return (PyObject *)unicode;
3970 }
3971 kind = PyUnicode_KIND(unicode);
3972 data = PyUnicode_DATA(unicode);
3973 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003974 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003975 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003976 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00003977 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003978
3979 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003980 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003981
3982 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00003983 /* Fast path for runs of ASCII characters. Given that common UTF-8
3984 input will consist of an overwhelming majority of ASCII
3985 characters, we try to optimize for this case by checking
3986 as many characters as a C 'long' can contain.
3987 First, check if we can do an aligned read, as most CPUs have
3988 a penalty for unaligned reads.
3989 */
3990 if (!((size_t) s & LONG_PTR_MASK)) {
3991 /* Help register allocation */
3992 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003993 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003994 while (_s < aligned_end) {
3995 /* Read a whole long at a time (either 4 or 8 bytes),
3996 and do a fast unrolled copy if it only contains ASCII
3997 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003998 unsigned long value = *(unsigned long *) _s;
3999 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004000 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004001 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
4002 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
4003 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
4004 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004005#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004006 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
4007 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
4008 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
4009 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004010#endif
4011 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004012 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004013 }
4014 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004015 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004016 if (s == e)
4017 break;
4018 ch = (unsigned char)*s;
4019 }
4020 }
4021
4022 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004023 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004024 s++;
4025 continue;
4026 }
4027
4028 n = utf8_code_length[ch];
4029
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004030 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004031 if (consumed)
4032 break;
4033 else {
4034 errmsg = "unexpected end of data";
4035 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004036 endinpos = startinpos+1;
4037 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4038 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004039 goto utf8Error;
4040 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004041 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004042
4043 switch (n) {
4044
4045 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004046 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004047 startinpos = s-starts;
4048 endinpos = startinpos+1;
4049 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004050
4051 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004052 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004053 startinpos = s-starts;
4054 endinpos = startinpos+1;
4055 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004056
4057 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004058 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004059 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004060 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004061 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004062 goto utf8Error;
4063 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004064 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004065 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004066 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004067 break;
4068
4069 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004070 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4071 will result in surrogates in range d800-dfff. Surrogates are
4072 not valid UTF-8 so they are rejected.
4073 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4074 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004075 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004076 (s[2] & 0xc0) != 0x80 ||
4077 ((unsigned char)s[0] == 0xE0 &&
4078 (unsigned char)s[1] < 0xA0) ||
4079 ((unsigned char)s[0] == 0xED &&
4080 (unsigned char)s[1] > 0x9F)) {
4081 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004082 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004083 endinpos = startinpos + 1;
4084
4085 /* if s[1] first two bits are 1 and 0, then the invalid
4086 continuation byte is s[2], so increment endinpos by 1,
4087 if not, s[1] is invalid and endinpos doesn't need to
4088 be incremented. */
4089 if ((s[1] & 0xC0) == 0x80)
4090 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004091 goto utf8Error;
4092 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004093 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004094 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004095 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004096 break;
4097
4098 case 4:
4099 if ((s[1] & 0xc0) != 0x80 ||
4100 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004101 (s[3] & 0xc0) != 0x80 ||
4102 ((unsigned char)s[0] == 0xF0 &&
4103 (unsigned char)s[1] < 0x90) ||
4104 ((unsigned char)s[0] == 0xF4 &&
4105 (unsigned char)s[1] > 0x8F)) {
4106 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004107 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004108 endinpos = startinpos + 1;
4109 if ((s[1] & 0xC0) == 0x80) {
4110 endinpos++;
4111 if ((s[2] & 0xC0) == 0x80)
4112 endinpos++;
4113 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004114 goto utf8Error;
4115 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004116 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004117 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4118 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4119
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004120 /* If the string is flexible or we have native UCS-4, write
4121 directly.. */
4122 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
4123 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00004124
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004125 else {
4126 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00004127
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004128 /* translate from 10000..10FFFF to 0..FFFF */
4129 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00004130
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004131 /* high surrogate = top 10 bits added to D800 */
4132 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4133 (Py_UNICODE)(0xD800 + (ch >> 10)));
4134
4135 /* low surrogate = bottom 10 bits added to DC00 */
4136 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4137 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
4138 }
4139#if SIZEOF_WCHAR_T == 2
4140 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004141#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004142 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004143 }
4144 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004145 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004146
Benjamin Peterson29060642009-01-31 22:14:21 +00004147 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004148 /* If this is not yet a resizable string, make it one.. */
4149 if (kind != PyUnicode_WCHAR_KIND) {
4150 const Py_UNICODE *u;
4151 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
4152 if (!new_unicode)
4153 goto onError;
4154 u = PyUnicode_AsUnicode((PyObject *)unicode);
4155 if (!u)
4156 goto onError;
4157#if SIZEOF_WCHAR_T == 2
4158 i += wchar_offset;
4159#endif
4160 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
4161 Py_DECREF(unicode);
4162 unicode = new_unicode;
4163 kind = 0;
4164 data = PyUnicode_AS_UNICODE(new_unicode);
4165 assert(data != NULL);
4166 }
4167 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00004168 if (unicode_decode_call_errorhandler(
4169 errors, &errorHandler,
4170 "utf8", errmsg,
4171 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004172 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00004173 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004174 /* Update data because unicode_decode_call_errorhandler might have
4175 re-created or resized the unicode object. */
4176 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00004177 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004178 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004179 /* Ensure the unicode_size calculation above was correct: */
4180 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
4181
Walter Dörwald69652032004-09-07 20:24:22 +00004182 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004183 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004184
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004185 /* Adjust length and ready string when it contained errors and
4186 is of the old resizable kind. */
4187 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004188 if (PyUnicode_Resize((PyObject**)&unicode, i) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004189 goto onError;
4190 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004191
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004192 Py_XDECREF(errorHandler);
4193 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004194 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004195 Py_DECREF(unicode);
4196 return NULL;
4197 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004198 return (PyObject *)unicode;
4199
Benjamin Peterson29060642009-01-31 22:14:21 +00004200 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004201 Py_XDECREF(errorHandler);
4202 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004203 Py_DECREF(unicode);
4204 return NULL;
4205}
4206
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004207#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00004208
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004209#ifdef __APPLE__
4210
4211/* Simplified UTF-8 decoder using surrogateescape error handler,
4212 used to decode the command line arguments on Mac OS X. */
4213
4214wchar_t*
4215_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4216{
4217 int n;
4218 const char *e;
4219 wchar_t *unicode, *p;
4220
4221 /* Note: size will always be longer than the resulting Unicode
4222 character count */
4223 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
4224 PyErr_NoMemory();
4225 return NULL;
4226 }
4227 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4228 if (!unicode)
4229 return NULL;
4230
4231 /* Unpack UTF-8 encoded data */
4232 p = unicode;
4233 e = s + size;
4234 while (s < e) {
4235 Py_UCS4 ch = (unsigned char)*s;
4236
4237 if (ch < 0x80) {
4238 *p++ = (wchar_t)ch;
4239 s++;
4240 continue;
4241 }
4242
4243 n = utf8_code_length[ch];
4244 if (s + n > e) {
4245 goto surrogateescape;
4246 }
4247
4248 switch (n) {
4249 case 0:
4250 case 1:
4251 goto surrogateescape;
4252
4253 case 2:
4254 if ((s[1] & 0xc0) != 0x80)
4255 goto surrogateescape;
4256 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
4257 assert ((ch > 0x007F) && (ch <= 0x07FF));
4258 *p++ = (wchar_t)ch;
4259 break;
4260
4261 case 3:
4262 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4263 will result in surrogates in range d800-dfff. Surrogates are
4264 not valid UTF-8 so they are rejected.
4265 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4266 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
4267 if ((s[1] & 0xc0) != 0x80 ||
4268 (s[2] & 0xc0) != 0x80 ||
4269 ((unsigned char)s[0] == 0xE0 &&
4270 (unsigned char)s[1] < 0xA0) ||
4271 ((unsigned char)s[0] == 0xED &&
4272 (unsigned char)s[1] > 0x9F)) {
4273
4274 goto surrogateescape;
4275 }
4276 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4277 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004278 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004279 break;
4280
4281 case 4:
4282 if ((s[1] & 0xc0) != 0x80 ||
4283 (s[2] & 0xc0) != 0x80 ||
4284 (s[3] & 0xc0) != 0x80 ||
4285 ((unsigned char)s[0] == 0xF0 &&
4286 (unsigned char)s[1] < 0x90) ||
4287 ((unsigned char)s[0] == 0xF4 &&
4288 (unsigned char)s[1] > 0x8F)) {
4289 goto surrogateescape;
4290 }
4291 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
4292 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4293 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4294
4295#if SIZEOF_WCHAR_T == 4
4296 *p++ = (wchar_t)ch;
4297#else
4298 /* compute and append the two surrogates: */
4299
4300 /* translate from 10000..10FFFF to 0..FFFF */
4301 ch -= 0x10000;
4302
4303 /* high surrogate = top 10 bits added to D800 */
4304 *p++ = (wchar_t)(0xD800 + (ch >> 10));
4305
4306 /* low surrogate = bottom 10 bits added to DC00 */
4307 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
4308#endif
4309 break;
4310 }
4311 s += n;
4312 continue;
4313
4314 surrogateescape:
4315 *p++ = 0xDC00 + ch;
4316 s++;
4317 }
4318 *p = L'\0';
4319 return unicode;
4320}
4321
4322#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004323
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004324/* Primary internal function which creates utf8 encoded bytes objects.
4325
4326 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004327 and allocate exactly as much space needed at the end. Else allocate the
4328 maximum possible needed (4 result bytes per Unicode character), and return
4329 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004330*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004331PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004332_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004333{
Tim Peters602f7402002-04-27 18:03:26 +00004334#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004335
Guido van Rossum98297ee2007-11-06 21:34:58 +00004336 Py_ssize_t i; /* index into s of next input byte */
4337 PyObject *result; /* result string object */
4338 char *p; /* next free byte in output buffer */
4339 Py_ssize_t nallocated; /* number of result bytes allocated */
4340 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004341 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004342 PyObject *errorHandler = NULL;
4343 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004344 int kind;
4345 void *data;
4346 Py_ssize_t size;
4347 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
4348#if SIZEOF_WCHAR_T == 2
4349 Py_ssize_t wchar_offset = 0;
4350#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004351
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004352 if (!PyUnicode_Check(unicode)) {
4353 PyErr_BadArgument();
4354 return NULL;
4355 }
4356
4357 if (PyUnicode_READY(unicode) == -1)
4358 return NULL;
4359
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004360 if (PyUnicode_UTF8(unicode))
4361 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4362 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004363
4364 kind = PyUnicode_KIND(unicode);
4365 data = PyUnicode_DATA(unicode);
4366 size = PyUnicode_GET_LENGTH(unicode);
4367
Tim Peters602f7402002-04-27 18:03:26 +00004368 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004369
Tim Peters602f7402002-04-27 18:03:26 +00004370 if (size <= MAX_SHORT_UNICHARS) {
4371 /* Write into the stack buffer; nallocated can't overflow.
4372 * At the end, we'll allocate exactly as much heap space as it
4373 * turns out we need.
4374 */
4375 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004376 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004377 p = stackbuf;
4378 }
4379 else {
4380 /* Overallocate on the heap, and give the excess back at the end. */
4381 nallocated = size * 4;
4382 if (nallocated / 4 != size) /* overflow! */
4383 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004384 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004385 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004386 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004387 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004388 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004389
Tim Peters602f7402002-04-27 18:03:26 +00004390 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004391 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004392
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004393 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004394 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004395 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004396
Guido van Rossumd57fd912000-03-10 22:53:23 +00004397 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004398 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004399 *p++ = (char)(0xc0 | (ch >> 6));
4400 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004401 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004402 Py_ssize_t newpos;
4403 PyObject *rep;
4404 Py_ssize_t repsize, k, startpos;
4405 startpos = i-1;
4406#if SIZEOF_WCHAR_T == 2
4407 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00004408#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004409 rep = unicode_encode_call_errorhandler(
4410 errors, &errorHandler, "utf-8", "surrogates not allowed",
4411 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4412 &exc, startpos, startpos+1, &newpos);
4413 if (!rep)
4414 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004415
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004416 if (PyBytes_Check(rep))
4417 repsize = PyBytes_GET_SIZE(rep);
4418 else
4419 repsize = PyUnicode_GET_SIZE(rep);
4420
4421 if (repsize > 4) {
4422 Py_ssize_t offset;
4423
4424 if (result == NULL)
4425 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004426 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004427 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004428
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004429 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4430 /* integer overflow */
4431 PyErr_NoMemory();
4432 goto error;
4433 }
4434 nallocated += repsize - 4;
4435 if (result != NULL) {
4436 if (_PyBytes_Resize(&result, nallocated) < 0)
4437 goto error;
4438 } else {
4439 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004440 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004441 goto error;
4442 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4443 }
4444 p = PyBytes_AS_STRING(result) + offset;
4445 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004446
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004447 if (PyBytes_Check(rep)) {
4448 char *prep = PyBytes_AS_STRING(rep);
4449 for(k = repsize; k > 0; k--)
4450 *p++ = *prep++;
4451 } else /* rep is unicode */ {
4452 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4453 Py_UNICODE c;
4454
4455 for(k=0; k<repsize; k++) {
4456 c = prep[k];
4457 if (0x80 <= c) {
4458 raise_encode_exception(&exc, "utf-8",
4459 PyUnicode_AS_UNICODE(unicode),
4460 size, i-1, i,
4461 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004462 goto error;
4463 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004464 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004465 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004466 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004467 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004468 } else if (ch < 0x10000) {
4469 *p++ = (char)(0xe0 | (ch >> 12));
4470 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4471 *p++ = (char)(0x80 | (ch & 0x3f));
4472 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004473 /* Encode UCS4 Unicode ordinals */
4474 *p++ = (char)(0xf0 | (ch >> 18));
4475 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4476 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4477 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004478#if SIZEOF_WCHAR_T == 2
4479 wchar_offset++;
4480#endif
Tim Peters602f7402002-04-27 18:03:26 +00004481 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004482 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004483
Guido van Rossum98297ee2007-11-06 21:34:58 +00004484 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004485 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004486 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004487 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004488 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004489 }
4490 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004491 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004492 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004493 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004494 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004495 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004496
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004497 Py_XDECREF(errorHandler);
4498 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004499 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004500 error:
4501 Py_XDECREF(errorHandler);
4502 Py_XDECREF(exc);
4503 Py_XDECREF(result);
4504 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004505
Tim Peters602f7402002-04-27 18:03:26 +00004506#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004507}
4508
Alexander Belopolsky40018472011-02-26 01:02:56 +00004509PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004510PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4511 Py_ssize_t size,
4512 const char *errors)
4513{
4514 PyObject *v, *unicode;
4515
4516 unicode = PyUnicode_FromUnicode(s, size);
4517 if (unicode == NULL)
4518 return NULL;
4519 v = _PyUnicode_AsUTF8String(unicode, errors);
4520 Py_DECREF(unicode);
4521 return v;
4522}
4523
4524PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004525PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004526{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004527 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004528}
4529
Walter Dörwald41980ca2007-08-16 21:55:45 +00004530/* --- UTF-32 Codec ------------------------------------------------------- */
4531
4532PyObject *
4533PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004534 Py_ssize_t size,
4535 const char *errors,
4536 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004537{
4538 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4539}
4540
4541PyObject *
4542PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004543 Py_ssize_t size,
4544 const char *errors,
4545 int *byteorder,
4546 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004547{
4548 const char *starts = s;
4549 Py_ssize_t startinpos;
4550 Py_ssize_t endinpos;
4551 Py_ssize_t outpos;
4552 PyUnicodeObject *unicode;
4553 Py_UNICODE *p;
4554#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004555 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004556 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004557#else
4558 const int pairs = 0;
4559#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004560 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004561 int bo = 0; /* assume native ordering by default */
4562 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004563 /* Offsets from q for retrieving bytes in the right order. */
4564#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4565 int iorder[] = {0, 1, 2, 3};
4566#else
4567 int iorder[] = {3, 2, 1, 0};
4568#endif
4569 PyObject *errorHandler = NULL;
4570 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004571
Walter Dörwald41980ca2007-08-16 21:55:45 +00004572 q = (unsigned char *)s;
4573 e = q + size;
4574
4575 if (byteorder)
4576 bo = *byteorder;
4577
4578 /* Check for BOM marks (U+FEFF) in the input and adjust current
4579 byte order setting accordingly. In native mode, the leading BOM
4580 mark is skipped, in all other modes, it is copied to the output
4581 stream as-is (giving a ZWNBSP character). */
4582 if (bo == 0) {
4583 if (size >= 4) {
4584 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004585 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004586#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004587 if (bom == 0x0000FEFF) {
4588 q += 4;
4589 bo = -1;
4590 }
4591 else if (bom == 0xFFFE0000) {
4592 q += 4;
4593 bo = 1;
4594 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004595#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004596 if (bom == 0x0000FEFF) {
4597 q += 4;
4598 bo = 1;
4599 }
4600 else if (bom == 0xFFFE0000) {
4601 q += 4;
4602 bo = -1;
4603 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004604#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004605 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004606 }
4607
4608 if (bo == -1) {
4609 /* force LE */
4610 iorder[0] = 0;
4611 iorder[1] = 1;
4612 iorder[2] = 2;
4613 iorder[3] = 3;
4614 }
4615 else if (bo == 1) {
4616 /* force BE */
4617 iorder[0] = 3;
4618 iorder[1] = 2;
4619 iorder[2] = 1;
4620 iorder[3] = 0;
4621 }
4622
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004623 /* On narrow builds we split characters outside the BMP into two
4624 codepoints => count how much extra space we need. */
4625#ifndef Py_UNICODE_WIDE
4626 for (qq = q; qq < e; qq += 4)
4627 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4628 pairs++;
4629#endif
4630
4631 /* This might be one to much, because of a BOM */
4632 unicode = _PyUnicode_New((size+3)/4+pairs);
4633 if (!unicode)
4634 return NULL;
4635 if (size == 0)
4636 return (PyObject *)unicode;
4637
4638 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004639 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004640
Walter Dörwald41980ca2007-08-16 21:55:45 +00004641 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004642 Py_UCS4 ch;
4643 /* remaining bytes at the end? (size should be divisible by 4) */
4644 if (e-q<4) {
4645 if (consumed)
4646 break;
4647 errmsg = "truncated data";
4648 startinpos = ((const char *)q)-starts;
4649 endinpos = ((const char *)e)-starts;
4650 goto utf32Error;
4651 /* The remaining input chars are ignored if the callback
4652 chooses to skip the input */
4653 }
4654 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4655 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004656
Benjamin Peterson29060642009-01-31 22:14:21 +00004657 if (ch >= 0x110000)
4658 {
4659 errmsg = "codepoint not in range(0x110000)";
4660 startinpos = ((const char *)q)-starts;
4661 endinpos = startinpos+4;
4662 goto utf32Error;
4663 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004664#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004665 if (ch >= 0x10000)
4666 {
4667 *p++ = 0xD800 | ((ch-0x10000) >> 10);
4668 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4669 }
4670 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00004671#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004672 *p++ = ch;
4673 q += 4;
4674 continue;
4675 utf32Error:
4676 outpos = p-PyUnicode_AS_UNICODE(unicode);
4677 if (unicode_decode_call_errorhandler(
4678 errors, &errorHandler,
4679 "utf32", errmsg,
4680 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4681 &unicode, &outpos, &p))
4682 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004683 }
4684
4685 if (byteorder)
4686 *byteorder = bo;
4687
4688 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004689 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004690
4691 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02004692 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004693 goto onError;
4694
4695 Py_XDECREF(errorHandler);
4696 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004697 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004698 Py_DECREF(unicode);
4699 return NULL;
4700 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004701 return (PyObject *)unicode;
4702
Benjamin Peterson29060642009-01-31 22:14:21 +00004703 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00004704 Py_DECREF(unicode);
4705 Py_XDECREF(errorHandler);
4706 Py_XDECREF(exc);
4707 return NULL;
4708}
4709
4710PyObject *
4711PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004712 Py_ssize_t size,
4713 const char *errors,
4714 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004715{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004716 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004717 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004718 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004719#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004720 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004721#else
4722 const int pairs = 0;
4723#endif
4724 /* Offsets from p for storing byte pairs in the right order. */
4725#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4726 int iorder[] = {0, 1, 2, 3};
4727#else
4728 int iorder[] = {3, 2, 1, 0};
4729#endif
4730
Benjamin Peterson29060642009-01-31 22:14:21 +00004731#define STORECHAR(CH) \
4732 do { \
4733 p[iorder[3]] = ((CH) >> 24) & 0xff; \
4734 p[iorder[2]] = ((CH) >> 16) & 0xff; \
4735 p[iorder[1]] = ((CH) >> 8) & 0xff; \
4736 p[iorder[0]] = (CH) & 0xff; \
4737 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00004738 } while(0)
4739
4740 /* In narrow builds we can output surrogate pairs as one codepoint,
4741 so we need less space. */
4742#ifndef Py_UNICODE_WIDE
4743 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004744 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
4745 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
4746 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004747#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004748 nsize = (size - pairs + (byteorder == 0));
4749 bytesize = nsize * 4;
4750 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004751 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004752 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004753 if (v == NULL)
4754 return NULL;
4755
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004756 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004757 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004758 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004759 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004760 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004761
4762 if (byteorder == -1) {
4763 /* force LE */
4764 iorder[0] = 0;
4765 iorder[1] = 1;
4766 iorder[2] = 2;
4767 iorder[3] = 3;
4768 }
4769 else if (byteorder == 1) {
4770 /* force BE */
4771 iorder[0] = 3;
4772 iorder[1] = 2;
4773 iorder[2] = 1;
4774 iorder[3] = 0;
4775 }
4776
4777 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004778 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004779#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004780 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
4781 Py_UCS4 ch2 = *s;
4782 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
4783 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
4784 s++;
4785 size--;
4786 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004787 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004788#endif
4789 STORECHAR(ch);
4790 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004791
4792 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004793 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004794#undef STORECHAR
4795}
4796
Alexander Belopolsky40018472011-02-26 01:02:56 +00004797PyObject *
4798PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004799{
4800 if (!PyUnicode_Check(unicode)) {
4801 PyErr_BadArgument();
4802 return NULL;
4803 }
4804 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004805 PyUnicode_GET_SIZE(unicode),
4806 NULL,
4807 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004808}
4809
Guido van Rossumd57fd912000-03-10 22:53:23 +00004810/* --- UTF-16 Codec ------------------------------------------------------- */
4811
Tim Peters772747b2001-08-09 22:21:55 +00004812PyObject *
4813PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004814 Py_ssize_t size,
4815 const char *errors,
4816 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004817{
Walter Dörwald69652032004-09-07 20:24:22 +00004818 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
4819}
4820
Antoine Pitrouab868312009-01-10 15:40:25 +00004821/* Two masks for fast checking of whether a C 'long' may contain
4822 UTF16-encoded surrogate characters. This is an efficient heuristic,
4823 assuming that non-surrogate characters with a code point >= 0x8000 are
4824 rare in most input.
4825 FAST_CHAR_MASK is used when the input is in native byte ordering,
4826 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00004827*/
Antoine Pitrouab868312009-01-10 15:40:25 +00004828#if (SIZEOF_LONG == 8)
4829# define FAST_CHAR_MASK 0x8000800080008000L
4830# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
4831#elif (SIZEOF_LONG == 4)
4832# define FAST_CHAR_MASK 0x80008000L
4833# define SWAPPED_FAST_CHAR_MASK 0x00800080L
4834#else
4835# error C 'long' size should be either 4 or 8!
4836#endif
4837
Walter Dörwald69652032004-09-07 20:24:22 +00004838PyObject *
4839PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004840 Py_ssize_t size,
4841 const char *errors,
4842 int *byteorder,
4843 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004844{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004845 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004846 Py_ssize_t startinpos;
4847 Py_ssize_t endinpos;
4848 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004849 PyUnicodeObject *unicode;
4850 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004851 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00004852 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00004853 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004854 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00004855 /* Offsets from q for retrieving byte pairs in the right order. */
4856#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4857 int ihi = 1, ilo = 0;
4858#else
4859 int ihi = 0, ilo = 1;
4860#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004861 PyObject *errorHandler = NULL;
4862 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004863
4864 /* Note: size will always be longer than the resulting Unicode
4865 character count */
4866 unicode = _PyUnicode_New(size);
4867 if (!unicode)
4868 return NULL;
4869 if (size == 0)
4870 return (PyObject *)unicode;
4871
4872 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004873 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00004874 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00004875 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004876
4877 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00004878 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004879
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004880 /* Check for BOM marks (U+FEFF) in the input and adjust current
4881 byte order setting accordingly. In native mode, the leading BOM
4882 mark is skipped, in all other modes, it is copied to the output
4883 stream as-is (giving a ZWNBSP character). */
4884 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00004885 if (size >= 2) {
4886 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004887#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004888 if (bom == 0xFEFF) {
4889 q += 2;
4890 bo = -1;
4891 }
4892 else if (bom == 0xFFFE) {
4893 q += 2;
4894 bo = 1;
4895 }
Tim Petersced69f82003-09-16 20:30:58 +00004896#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004897 if (bom == 0xFEFF) {
4898 q += 2;
4899 bo = 1;
4900 }
4901 else if (bom == 0xFFFE) {
4902 q += 2;
4903 bo = -1;
4904 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004905#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004906 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004907 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004908
Tim Peters772747b2001-08-09 22:21:55 +00004909 if (bo == -1) {
4910 /* force LE */
4911 ihi = 1;
4912 ilo = 0;
4913 }
4914 else if (bo == 1) {
4915 /* force BE */
4916 ihi = 0;
4917 ilo = 1;
4918 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004919#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4920 native_ordering = ilo < ihi;
4921#else
4922 native_ordering = ilo > ihi;
4923#endif
Tim Peters772747b2001-08-09 22:21:55 +00004924
Antoine Pitrouab868312009-01-10 15:40:25 +00004925 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00004926 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004927 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00004928 /* First check for possible aligned read of a C 'long'. Unaligned
4929 reads are more expensive, better to defer to another iteration. */
4930 if (!((size_t) q & LONG_PTR_MASK)) {
4931 /* Fast path for runs of non-surrogate chars. */
4932 register const unsigned char *_q = q;
4933 Py_UNICODE *_p = p;
4934 if (native_ordering) {
4935 /* Native ordering is simple: as long as the input cannot
4936 possibly contain a surrogate char, do an unrolled copy
4937 of several 16-bit code points to the target object.
4938 The non-surrogate check is done on several input bytes
4939 at a time (as many as a C 'long' can contain). */
4940 while (_q < aligned_end) {
4941 unsigned long data = * (unsigned long *) _q;
4942 if (data & FAST_CHAR_MASK)
4943 break;
4944 _p[0] = ((unsigned short *) _q)[0];
4945 _p[1] = ((unsigned short *) _q)[1];
4946#if (SIZEOF_LONG == 8)
4947 _p[2] = ((unsigned short *) _q)[2];
4948 _p[3] = ((unsigned short *) _q)[3];
4949#endif
4950 _q += SIZEOF_LONG;
4951 _p += SIZEOF_LONG / 2;
4952 }
4953 }
4954 else {
4955 /* Byteswapped ordering is similar, but we must decompose
4956 the copy bytewise, and take care of zero'ing out the
4957 upper bytes if the target object is in 32-bit units
4958 (that is, in UCS-4 builds). */
4959 while (_q < aligned_end) {
4960 unsigned long data = * (unsigned long *) _q;
4961 if (data & SWAPPED_FAST_CHAR_MASK)
4962 break;
4963 /* Zero upper bytes in UCS-4 builds */
4964#if (Py_UNICODE_SIZE > 2)
4965 _p[0] = 0;
4966 _p[1] = 0;
4967#if (SIZEOF_LONG == 8)
4968 _p[2] = 0;
4969 _p[3] = 0;
4970#endif
4971#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004972 /* Issue #4916; UCS-4 builds on big endian machines must
4973 fill the two last bytes of each 4-byte unit. */
4974#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
4975# define OFF 2
4976#else
4977# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00004978#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004979 ((unsigned char *) _p)[OFF + 1] = _q[0];
4980 ((unsigned char *) _p)[OFF + 0] = _q[1];
4981 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
4982 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
4983#if (SIZEOF_LONG == 8)
4984 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
4985 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
4986 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
4987 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
4988#endif
4989#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00004990 _q += SIZEOF_LONG;
4991 _p += SIZEOF_LONG / 2;
4992 }
4993 }
4994 p = _p;
4995 q = _q;
4996 if (q >= e)
4997 break;
4998 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004999 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005000
Benjamin Peterson14339b62009-01-31 16:36:08 +00005001 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005002
5003 if (ch < 0xD800 || ch > 0xDFFF) {
5004 *p++ = ch;
5005 continue;
5006 }
5007
5008 /* UTF-16 code pair: */
5009 if (q > e) {
5010 errmsg = "unexpected end of data";
5011 startinpos = (((const char *)q) - 2) - starts;
5012 endinpos = ((const char *)e) + 1 - starts;
5013 goto utf16Error;
5014 }
5015 if (0xD800 <= ch && ch <= 0xDBFF) {
5016 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
5017 q += 2;
5018 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00005019#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005020 *p++ = ch;
5021 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005022#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005023 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005024#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005025 continue;
5026 }
5027 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005028 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005029 startinpos = (((const char *)q)-4)-starts;
5030 endinpos = startinpos+2;
5031 goto utf16Error;
5032 }
5033
Benjamin Peterson14339b62009-01-31 16:36:08 +00005034 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005035 errmsg = "illegal encoding";
5036 startinpos = (((const char *)q)-2)-starts;
5037 endinpos = startinpos+2;
5038 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005039
Benjamin Peterson29060642009-01-31 22:14:21 +00005040 utf16Error:
5041 outpos = p - PyUnicode_AS_UNICODE(unicode);
5042 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005043 errors,
5044 &errorHandler,
5045 "utf16", errmsg,
5046 &starts,
5047 (const char **)&e,
5048 &startinpos,
5049 &endinpos,
5050 &exc,
5051 (const char **)&q,
5052 &unicode,
5053 &outpos,
5054 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00005055 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005056 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005057 /* remaining byte at the end? (size should be even) */
5058 if (e == q) {
5059 if (!consumed) {
5060 errmsg = "truncated data";
5061 startinpos = ((const char *)q) - starts;
5062 endinpos = ((const char *)e) + 1 - starts;
5063 outpos = p - PyUnicode_AS_UNICODE(unicode);
5064 if (unicode_decode_call_errorhandler(
5065 errors,
5066 &errorHandler,
5067 "utf16", errmsg,
5068 &starts,
5069 (const char **)&e,
5070 &startinpos,
5071 &endinpos,
5072 &exc,
5073 (const char **)&q,
5074 &unicode,
5075 &outpos,
5076 &p))
5077 goto onError;
5078 /* The remaining input chars are ignored if the callback
5079 chooses to skip the input */
5080 }
5081 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005082
5083 if (byteorder)
5084 *byteorder = bo;
5085
Walter Dörwald69652032004-09-07 20:24:22 +00005086 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005087 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005088
Guido van Rossumd57fd912000-03-10 22:53:23 +00005089 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02005090 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005091 goto onError;
5092
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005093 Py_XDECREF(errorHandler);
5094 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005095 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005096 Py_DECREF(unicode);
5097 return NULL;
5098 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005099 return (PyObject *)unicode;
5100
Benjamin Peterson29060642009-01-31 22:14:21 +00005101 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005102 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005103 Py_XDECREF(errorHandler);
5104 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005105 return NULL;
5106}
5107
Antoine Pitrouab868312009-01-10 15:40:25 +00005108#undef FAST_CHAR_MASK
5109#undef SWAPPED_FAST_CHAR_MASK
5110
Tim Peters772747b2001-08-09 22:21:55 +00005111PyObject *
5112PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005113 Py_ssize_t size,
5114 const char *errors,
5115 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005116{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005117 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005118 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005119 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005120#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005121 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005122#else
5123 const int pairs = 0;
5124#endif
Tim Peters772747b2001-08-09 22:21:55 +00005125 /* Offsets from p for storing byte pairs in the right order. */
5126#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5127 int ihi = 1, ilo = 0;
5128#else
5129 int ihi = 0, ilo = 1;
5130#endif
5131
Benjamin Peterson29060642009-01-31 22:14:21 +00005132#define STORECHAR(CH) \
5133 do { \
5134 p[ihi] = ((CH) >> 8) & 0xff; \
5135 p[ilo] = (CH) & 0xff; \
5136 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005137 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005138
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005139#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005140 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00005141 if (s[i] >= 0x10000)
5142 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005143#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005144 /* 2 * (size + pairs + (byteorder == 0)) */
5145 if (size > PY_SSIZE_T_MAX ||
5146 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005147 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005148 nsize = size + pairs + (byteorder == 0);
5149 bytesize = nsize * 2;
5150 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005151 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005152 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005153 if (v == NULL)
5154 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005155
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005156 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005157 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005158 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005159 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005160 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005161
5162 if (byteorder == -1) {
5163 /* force LE */
5164 ihi = 1;
5165 ilo = 0;
5166 }
5167 else if (byteorder == 1) {
5168 /* force BE */
5169 ihi = 0;
5170 ilo = 1;
5171 }
5172
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005173 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005174 Py_UNICODE ch = *s++;
5175 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005176#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005177 if (ch >= 0x10000) {
5178 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5179 ch = 0xD800 | ((ch-0x10000) >> 10);
5180 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005181#endif
Tim Peters772747b2001-08-09 22:21:55 +00005182 STORECHAR(ch);
5183 if (ch2)
5184 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005185 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005186
5187 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005188 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005189#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005190}
5191
Alexander Belopolsky40018472011-02-26 01:02:56 +00005192PyObject *
5193PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005194{
5195 if (!PyUnicode_Check(unicode)) {
5196 PyErr_BadArgument();
5197 return NULL;
5198 }
5199 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005200 PyUnicode_GET_SIZE(unicode),
5201 NULL,
5202 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005203}
5204
5205/* --- Unicode Escape Codec ----------------------------------------------- */
5206
/* Helper function for PyUnicode_DecodeUnicodeEscape: determines whether
   the escaped input still decodes to a pure ASCII string.

   s    -- input bytes (not required to be NUL-terminated)
   size -- number of bytes available at s

   Returns the exact number of characters the decoded string will hold
   when every input byte is ASCII and only simple escapes are used:
     - backslash + newline contributes zero characters,
     - the single-character escapes (\\ \' \" \b \f \t \n \r \v \a)
       contribute one character,
     - an unrecognised escape contributes two (backslash + character).

   Returns -1 when size is negative, when a non-ASCII byte is present,
   when the input ends in a lone backslash, or when an octal, \x, \u, \U
   or \N escape is found (these do not guarantee ASCII characters).
   */
Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Reject a negative size before forming the end pointer; computing
       p + size with a negative size would be invalid pointer arithmetic. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* count the backslash + the other character */
                length += 2;
                break;
            }
        }
    }
    return length;
}
5261
/* Similar to PyUnicode_WRITE but either write into wstr field
   or treat string as ASCII: `value` is stored at position `index` of
   `buf` as an unsigned char unless `kind` is PyUnicode_WCHAR_KIND, in
   which case `buf` is addressed as a Py_UNICODE array.  Only one branch
   runs, so an `index` argument with side effects (e.g. i++) is
   evaluated exactly once. */
#define WRITE_ASCII_OR_WSTR(kind, buf, index, value) \
    do { \
        if ((kind) != PyUnicode_WCHAR_KIND) \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
        else \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
    } while (0)

/* Store `value` at position `index` of the Py_UNICODE array `buf`.
   Relies on a variable named `kind` in the enclosing scope and asserts
   that it equals PyUnicode_WCHAR_KIND.  Wrapped in do/while(0), like
   WRITE_ASCII_OR_WSTR, so that it always behaves as a single statement
   instead of expanding to a bare comma expression. */
#define WRITE_WSTR(buf, index, value) \
    do { \
        assert(kind == PyUnicode_WCHAR_KIND); \
        ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
    } while (0)

5276
Fredrik Lundh06d12682001-01-24 07:59:11 +00005277static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005278
Alexander Belopolsky40018472011-02-26 01:02:56 +00005279PyObject *
5280PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005281 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005282 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005283{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005284 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005285 Py_ssize_t startinpos;
5286 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005287 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005288 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005289 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005290 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005291 char* message;
5292 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005293 PyObject *errorHandler = NULL;
5294 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005295 Py_ssize_t ascii_length;
5296 Py_ssize_t i;
5297 int kind;
5298 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005299
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005300 ascii_length = length_of_escaped_ascii_string(s, size);
5301
5302 /* After length_of_escaped_ascii_string() there are two alternatives,
5303 either the string is pure ASCII with named escapes like \n, etc.
5304 and we determined it's exact size (common case)
5305 or it contains \x, \u, ... escape sequences. then we create a
5306 legacy wchar string and resize it at the end of this function. */
5307 if (ascii_length >= 0) {
5308 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
5309 if (!v)
5310 goto onError;
5311 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5312 kind = PyUnicode_1BYTE_KIND;
5313 data = PyUnicode_DATA(v);
5314 }
5315 else {
5316 /* Escaped strings will always be longer than the resulting
5317 Unicode string, so we start with size here and then reduce the
5318 length after conversion to the true value.
5319 (but if the error callback returns a long replacement string
5320 we'll have to allocate more space) */
5321 v = _PyUnicode_New(size);
5322 if (!v)
5323 goto onError;
5324 kind = PyUnicode_WCHAR_KIND;
5325 data = PyUnicode_AS_UNICODE(v);
5326 }
5327
Guido van Rossumd57fd912000-03-10 22:53:23 +00005328 if (size == 0)
5329 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005330 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005331 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005332
Guido van Rossumd57fd912000-03-10 22:53:23 +00005333 while (s < end) {
5334 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005335 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005336 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005337
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005338 if (kind == PyUnicode_WCHAR_KIND) {
5339 assert(i < _PyUnicode_WSTR_LENGTH(v));
5340 }
5341 else {
5342 /* The only case in which i == ascii_length is a backslash
5343 followed by a newline. */
5344 assert(i <= ascii_length);
5345 }
5346
Guido van Rossumd57fd912000-03-10 22:53:23 +00005347 /* Non-escape characters are interpreted as Unicode ordinals */
5348 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005349 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005350 continue;
5351 }
5352
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005353 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005354 /* \ - Escapes */
5355 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005356 c = *s++;
5357 if (s > end)
5358 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005359
5360 if (kind == PyUnicode_WCHAR_KIND) {
5361 assert(i < _PyUnicode_WSTR_LENGTH(v));
5362 }
5363 else {
5364 /* The only case in which i == ascii_length is a backslash
5365 followed by a newline. */
5366 assert(i < ascii_length || (i == ascii_length && c == '\n'));
5367 }
5368
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005369 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005370
Benjamin Peterson29060642009-01-31 22:14:21 +00005371 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005372 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005373 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5374 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5375 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5376 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5377 /* FF */
5378 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5379 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5380 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5381 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5382 /* VT */
5383 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5384 /* BEL, not classic C */
5385 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005386
Benjamin Peterson29060642009-01-31 22:14:21 +00005387 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005388 case '0': case '1': case '2': case '3':
5389 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005390 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005391 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005392 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005393 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005394 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005395 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005396 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005397 break;
5398
Benjamin Peterson29060642009-01-31 22:14:21 +00005399 /* hex escapes */
5400 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005401 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005402 digits = 2;
5403 message = "truncated \\xXX escape";
5404 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005405
Benjamin Peterson29060642009-01-31 22:14:21 +00005406 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005407 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005408 digits = 4;
5409 message = "truncated \\uXXXX escape";
5410 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005411
Benjamin Peterson29060642009-01-31 22:14:21 +00005412 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005413 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005414 digits = 8;
5415 message = "truncated \\UXXXXXXXX escape";
5416 hexescape:
5417 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005418 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005419 if (s+digits>end) {
5420 endinpos = size;
5421 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005422 errors, &errorHandler,
5423 "unicodeescape", "end of string in escape sequence",
5424 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005425 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005426 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005427 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005428 goto nextByte;
5429 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005430 for (j = 0; j < digits; ++j) {
5431 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005432 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005433 endinpos = (s+j+1)-starts;
5434 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005435 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005436 errors, &errorHandler,
5437 "unicodeescape", message,
5438 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005439 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005440 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005441 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005442 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005443 }
5444 chr = (chr<<4) & ~0xF;
5445 if (c >= '0' && c <= '9')
5446 chr += c - '0';
5447 else if (c >= 'a' && c <= 'f')
5448 chr += 10 + c - 'a';
5449 else
5450 chr += 10 + c - 'A';
5451 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005452 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005453 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005454 /* _decoding_error will have already written into the
5455 target buffer. */
5456 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005457 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005458 /* when we get here, chr is a 32-bit unicode character */
5459 if (chr <= 0xffff)
5460 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005461 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005462 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005463 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005464 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005465#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005466 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005467#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005468 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005469 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5470 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005471#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005472 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005473 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005474 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005475 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005476 errors, &errorHandler,
5477 "unicodeescape", "illegal Unicode character",
5478 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005479 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005480 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005481 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005482 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005483 break;
5484
Benjamin Peterson29060642009-01-31 22:14:21 +00005485 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005486 case 'N':
5487 message = "malformed \\N character escape";
5488 if (ucnhash_CAPI == NULL) {
5489 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005490 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5491 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005492 if (ucnhash_CAPI == NULL)
5493 goto ucnhashError;
5494 }
5495 if (*s == '{') {
5496 const char *start = s+1;
5497 /* look for the closing brace */
5498 while (*s != '}' && s < end)
5499 s++;
5500 if (s > start && s < end && *s == '}') {
5501 /* found a name. look it up in the unicode database */
5502 message = "unknown Unicode character name";
5503 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005504 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5505 &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005506 goto store;
5507 }
5508 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005509 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005510 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005511 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005512 errors, &errorHandler,
5513 "unicodeescape", message,
5514 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005515 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005516 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005517 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005518 break;
5519
5520 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005521 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005522 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005523 message = "\\ at end of string";
5524 s--;
5525 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005526 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005527 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005528 errors, &errorHandler,
5529 "unicodeescape", message,
5530 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005531 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005532 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005533 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005534 }
5535 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005536 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5537 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005538 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005539 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005540 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005541 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005542 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005543 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005544 /* Ensure the length prediction worked in case of ASCII strings */
5545 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5546
Victor Stinnerfe226c02011-10-03 03:52:20 +02005547 if (kind == PyUnicode_WCHAR_KIND)
5548 {
5549 if (PyUnicode_Resize((PyObject**)&v, i) < 0)
5550 goto onError;
Victor Stinnerfe226c02011-10-03 03:52:20 +02005551 }
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005552 Py_XDECREF(errorHandler);
5553 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005554 if (_PyUnicode_READY_REPLACE(&v)) {
5555 Py_DECREF(v);
5556 return NULL;
5557 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005558 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005559
Benjamin Peterson29060642009-01-31 22:14:21 +00005560 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005561 PyErr_SetString(
5562 PyExc_UnicodeError,
5563 "\\N escapes not supported (can't load unicodedata module)"
5564 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005565 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005566 Py_XDECREF(errorHandler);
5567 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005568 return NULL;
5569
Benjamin Peterson29060642009-01-31 22:14:21 +00005570 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005571 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005572 Py_XDECREF(errorHandler);
5573 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005574 return NULL;
5575}
5576
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005577#undef WRITE_ASCII_OR_WSTR
5578#undef WRITE_WSTR
5579
Guido van Rossumd57fd912000-03-10 22:53:23 +00005580/* Return a Unicode-Escape string version of the Unicode object.
5581
5582 If quotes is true, the string is enclosed in u"" or u'' quotes as
5583 appropriate.
5584
5585*/
5586
Walter Dörwald79e913e2007-05-12 11:08:06 +00005587static const char *hexdigits = "0123456789abcdef";
5588
Alexander Belopolsky40018472011-02-26 01:02:56 +00005589PyObject *
5590PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005591 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005592{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005593 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005594 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005595
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005596#ifdef Py_UNICODE_WIDE
5597 const Py_ssize_t expandsize = 10;
5598#else
5599 const Py_ssize_t expandsize = 6;
5600#endif
5601
Thomas Wouters89f507f2006-12-13 04:49:30 +00005602 /* XXX(nnorwitz): rather than over-allocating, it would be
5603 better to choose a different scheme. Perhaps scan the
5604 first N-chars of the string and allocate based on that size.
5605 */
5606 /* Initial allocation is based on the longest-possible unichr
5607 escape.
5608
5609 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5610 unichr, so in this case it's the longest unichr escape. In
5611 narrow (UTF-16) builds this is five chars per source unichr
5612 since there are two unichrs in the surrogate pair, so in narrow
5613 (UTF-16) builds it's not the longest unichr escape.
5614
5615 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5616 so in the narrow (UTF-16) build case it's the longest unichr
5617 escape.
5618 */
5619
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005620 if (size == 0)
5621 return PyBytes_FromStringAndSize(NULL, 0);
5622
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005623 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005624 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005625
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005626 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005627 2
5628 + expandsize*size
5629 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005630 if (repr == NULL)
5631 return NULL;
5632
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005633 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005634
Guido van Rossumd57fd912000-03-10 22:53:23 +00005635 while (size-- > 0) {
5636 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005637
Walter Dörwald79e913e2007-05-12 11:08:06 +00005638 /* Escape backslashes */
5639 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005640 *p++ = '\\';
5641 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005642 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005643 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005644
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00005645#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005646 /* Map 21-bit characters to '\U00xxxxxx' */
5647 else if (ch >= 0x10000) {
5648 *p++ = '\\';
5649 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005650 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5651 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5652 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5653 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5654 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5655 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5656 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5657 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005658 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005659 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005660#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005661 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5662 else if (ch >= 0xD800 && ch < 0xDC00) {
5663 Py_UNICODE ch2;
5664 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00005665
Benjamin Peterson29060642009-01-31 22:14:21 +00005666 ch2 = *s++;
5667 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005668 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005669 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5670 *p++ = '\\';
5671 *p++ = 'U';
5672 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
5673 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
5674 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
5675 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
5676 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
5677 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
5678 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
5679 *p++ = hexdigits[ucs & 0x0000000F];
5680 continue;
5681 }
5682 /* Fall through: isolated surrogates are copied as-is */
5683 s--;
5684 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005685 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005686#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005687
Guido van Rossumd57fd912000-03-10 22:53:23 +00005688 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005689 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005690 *p++ = '\\';
5691 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005692 *p++ = hexdigits[(ch >> 12) & 0x000F];
5693 *p++ = hexdigits[(ch >> 8) & 0x000F];
5694 *p++ = hexdigits[(ch >> 4) & 0x000F];
5695 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005696 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005697
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005698 /* Map special whitespace to '\t', \n', '\r' */
5699 else if (ch == '\t') {
5700 *p++ = '\\';
5701 *p++ = 't';
5702 }
5703 else if (ch == '\n') {
5704 *p++ = '\\';
5705 *p++ = 'n';
5706 }
5707 else if (ch == '\r') {
5708 *p++ = '\\';
5709 *p++ = 'r';
5710 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005711
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005712 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005713 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005714 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005715 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005716 *p++ = hexdigits[(ch >> 4) & 0x000F];
5717 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005718 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005719
Guido van Rossumd57fd912000-03-10 22:53:23 +00005720 /* Copy everything else as-is */
5721 else
5722 *p++ = (char) ch;
5723 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005724
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005725 assert(p - PyBytes_AS_STRING(repr) > 0);
5726 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5727 return NULL;
5728 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005729}
5730
Alexander Belopolsky40018472011-02-26 01:02:56 +00005731PyObject *
5732PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005733{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005734 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005735 if (!PyUnicode_Check(unicode)) {
5736 PyErr_BadArgument();
5737 return NULL;
5738 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00005739 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5740 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005741 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005742}
5743
5744/* --- Raw Unicode Escape Codec ------------------------------------------- */
5745
Alexander Belopolsky40018472011-02-26 01:02:56 +00005746PyObject *
5747PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005748 Py_ssize_t size,
5749 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005750{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005751 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005752 Py_ssize_t startinpos;
5753 Py_ssize_t endinpos;
5754 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005755 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005756 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005757 const char *end;
5758 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005759 PyObject *errorHandler = NULL;
5760 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005761
Guido van Rossumd57fd912000-03-10 22:53:23 +00005762 /* Escaped strings will always be longer than the resulting
5763 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005764 length after conversion to the true value. (But decoding error
5765 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005766 v = _PyUnicode_New(size);
5767 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005768 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005769 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005770 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005771 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005772 end = s + size;
5773 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005774 unsigned char c;
5775 Py_UCS4 x;
5776 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005777 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005778
Benjamin Peterson29060642009-01-31 22:14:21 +00005779 /* Non-escape characters are interpreted as Unicode ordinals */
5780 if (*s != '\\') {
5781 *p++ = (unsigned char)*s++;
5782 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005783 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005784 startinpos = s-starts;
5785
5786 /* \u-escapes are only interpreted iff the number of leading
5787 backslashes if odd */
5788 bs = s;
5789 for (;s < end;) {
5790 if (*s != '\\')
5791 break;
5792 *p++ = (unsigned char)*s++;
5793 }
5794 if (((s - bs) & 1) == 0 ||
5795 s >= end ||
5796 (*s != 'u' && *s != 'U')) {
5797 continue;
5798 }
5799 p--;
5800 count = *s=='u' ? 4 : 8;
5801 s++;
5802
5803 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
5804 outpos = p-PyUnicode_AS_UNICODE(v);
5805 for (x = 0, i = 0; i < count; ++i, ++s) {
5806 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005807 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005808 endinpos = s-starts;
5809 if (unicode_decode_call_errorhandler(
5810 errors, &errorHandler,
5811 "rawunicodeescape", "truncated \\uXXXX",
5812 &starts, &end, &startinpos, &endinpos, &exc, &s,
5813 &v, &outpos, &p))
5814 goto onError;
5815 goto nextByte;
5816 }
5817 x = (x<<4) & ~0xF;
5818 if (c >= '0' && c <= '9')
5819 x += c - '0';
5820 else if (c >= 'a' && c <= 'f')
5821 x += 10 + c - 'a';
5822 else
5823 x += 10 + c - 'A';
5824 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00005825 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00005826 /* UCS-2 character */
5827 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005828 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005829 /* UCS-4 character. Either store directly, or as
5830 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00005831#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005832 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005833#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005834 x -= 0x10000L;
5835 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
5836 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00005837#endif
5838 } else {
5839 endinpos = s-starts;
5840 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005841 if (unicode_decode_call_errorhandler(
5842 errors, &errorHandler,
5843 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005844 &starts, &end, &startinpos, &endinpos, &exc, &s,
5845 &v, &outpos, &p))
5846 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005847 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005848 nextByte:
5849 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005850 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02005851 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005852 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005853 Py_XDECREF(errorHandler);
5854 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005855 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005856 Py_DECREF(v);
5857 return NULL;
5858 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005859 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005860
Benjamin Peterson29060642009-01-31 22:14:21 +00005861 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005862 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005863 Py_XDECREF(errorHandler);
5864 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005865 return NULL;
5866}
5867
Alexander Belopolsky40018472011-02-26 01:02:56 +00005868PyObject *
5869PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005870 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005871{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005872 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005873 char *p;
5874 char *q;
5875
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005876#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005877 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005878#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005879 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005880#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00005881
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005882 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005883 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005884
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005885 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005886 if (repr == NULL)
5887 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005888 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005889 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005890
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005891 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005892 while (size-- > 0) {
5893 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005894#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005895 /* Map 32-bit characters to '\Uxxxxxxxx' */
5896 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005897 *p++ = '\\';
5898 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005899 *p++ = hexdigits[(ch >> 28) & 0xf];
5900 *p++ = hexdigits[(ch >> 24) & 0xf];
5901 *p++ = hexdigits[(ch >> 20) & 0xf];
5902 *p++ = hexdigits[(ch >> 16) & 0xf];
5903 *p++ = hexdigits[(ch >> 12) & 0xf];
5904 *p++ = hexdigits[(ch >> 8) & 0xf];
5905 *p++ = hexdigits[(ch >> 4) & 0xf];
5906 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00005907 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005908 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00005909#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005910 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5911 if (ch >= 0xD800 && ch < 0xDC00) {
5912 Py_UNICODE ch2;
5913 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005914
Benjamin Peterson29060642009-01-31 22:14:21 +00005915 ch2 = *s++;
5916 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005917 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005918 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5919 *p++ = '\\';
5920 *p++ = 'U';
5921 *p++ = hexdigits[(ucs >> 28) & 0xf];
5922 *p++ = hexdigits[(ucs >> 24) & 0xf];
5923 *p++ = hexdigits[(ucs >> 20) & 0xf];
5924 *p++ = hexdigits[(ucs >> 16) & 0xf];
5925 *p++ = hexdigits[(ucs >> 12) & 0xf];
5926 *p++ = hexdigits[(ucs >> 8) & 0xf];
5927 *p++ = hexdigits[(ucs >> 4) & 0xf];
5928 *p++ = hexdigits[ucs & 0xf];
5929 continue;
5930 }
5931 /* Fall through: isolated surrogates are copied as-is */
5932 s--;
5933 size++;
5934 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005935#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005936 /* Map 16-bit characters to '\uxxxx' */
5937 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005938 *p++ = '\\';
5939 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005940 *p++ = hexdigits[(ch >> 12) & 0xf];
5941 *p++ = hexdigits[(ch >> 8) & 0xf];
5942 *p++ = hexdigits[(ch >> 4) & 0xf];
5943 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005944 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005945 /* Copy everything else as-is */
5946 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00005947 *p++ = (char) ch;
5948 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005949 size = p - q;
5950
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005951 assert(size > 0);
5952 if (_PyBytes_Resize(&repr, size) < 0)
5953 return NULL;
5954 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005955}
5956
Alexander Belopolsky40018472011-02-26 01:02:56 +00005957PyObject *
5958PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005959{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005960 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005961 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00005962 PyErr_BadArgument();
5963 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005964 }
Walter Dörwald711005d2007-05-12 12:03:26 +00005965 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5966 PyUnicode_GET_SIZE(unicode));
5967
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005968 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005969}
5970
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005971/* --- Unicode Internal Codec ------------------------------------------- */
5972
Alexander Belopolsky40018472011-02-26 01:02:56 +00005973PyObject *
5974_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005975 Py_ssize_t size,
5976 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005977{
5978 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005979 Py_ssize_t startinpos;
5980 Py_ssize_t endinpos;
5981 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005982 PyUnicodeObject *v;
5983 Py_UNICODE *p;
5984 const char *end;
5985 const char *reason;
5986 PyObject *errorHandler = NULL;
5987 PyObject *exc = NULL;
5988
Neal Norwitzd43069c2006-01-08 01:12:10 +00005989#ifdef Py_UNICODE_WIDE
5990 Py_UNICODE unimax = PyUnicode_GetMax();
5991#endif
5992
Thomas Wouters89f507f2006-12-13 04:49:30 +00005993 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005994 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
5995 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005996 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005997 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
5998 as string was created with the old API. */
5999 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006000 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006001 p = PyUnicode_AS_UNICODE(v);
6002 end = s + size;
6003
6004 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006005 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006006 /* We have to sanity check the raw data, otherwise doom looms for
6007 some malformed UCS-4 data. */
6008 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006009#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006010 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006011#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006012 end-s < Py_UNICODE_SIZE
6013 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006014 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006015 startinpos = s - starts;
6016 if (end-s < Py_UNICODE_SIZE) {
6017 endinpos = end-starts;
6018 reason = "truncated input";
6019 }
6020 else {
6021 endinpos = s - starts + Py_UNICODE_SIZE;
6022 reason = "illegal code point (> 0x10FFFF)";
6023 }
6024 outpos = p - PyUnicode_AS_UNICODE(v);
6025 if (unicode_decode_call_errorhandler(
6026 errors, &errorHandler,
6027 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006028 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00006029 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006030 goto onError;
6031 }
6032 }
6033 else {
6034 p++;
6035 s += Py_UNICODE_SIZE;
6036 }
6037 }
6038
Victor Stinnerfe226c02011-10-03 03:52:20 +02006039 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006040 goto onError;
6041 Py_XDECREF(errorHandler);
6042 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006043 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006044 Py_DECREF(v);
6045 return NULL;
6046 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006047 return (PyObject *)v;
6048
Benjamin Peterson29060642009-01-31 22:14:21 +00006049 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006050 Py_XDECREF(v);
6051 Py_XDECREF(errorHandler);
6052 Py_XDECREF(exc);
6053 return NULL;
6054}
6055
Guido van Rossumd57fd912000-03-10 22:53:23 +00006056/* --- Latin-1 Codec ------------------------------------------------------ */
6057
Alexander Belopolsky40018472011-02-26 01:02:56 +00006058PyObject *
6059PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006060 Py_ssize_t size,
6061 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006062{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006063 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006064 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006065}
6066
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006067/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006068static void
6069make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006070 const char *encoding,
6071 const Py_UNICODE *unicode, Py_ssize_t size,
6072 Py_ssize_t startpos, Py_ssize_t endpos,
6073 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006074{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006075 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006076 *exceptionObject = PyUnicodeEncodeError_Create(
6077 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006078 }
6079 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006080 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6081 goto onError;
6082 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6083 goto onError;
6084 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6085 goto onError;
6086 return;
6087 onError:
6088 Py_DECREF(*exceptionObject);
6089 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006090 }
6091}
6092
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006093/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006094static void
6095raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006096 const char *encoding,
6097 const Py_UNICODE *unicode, Py_ssize_t size,
6098 Py_ssize_t startpos, Py_ssize_t endpos,
6099 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006100{
6101 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006102 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006103 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006104 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006105}
6106
6107/* error handling callback helper:
6108 build arguments, call the callback and check the arguments,
6109 put the result into newpos and return the replacement string, which
6110 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006111static PyObject *
6112unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006113 PyObject **errorHandler,
6114 const char *encoding, const char *reason,
6115 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
6116 Py_ssize_t startpos, Py_ssize_t endpos,
6117 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006118{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006119 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006120
6121 PyObject *restuple;
6122 PyObject *resunicode;
6123
6124 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006125 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006126 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006127 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006128 }
6129
6130 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006131 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006132 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006133 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006134
6135 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006136 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006137 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006138 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006139 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006140 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006141 Py_DECREF(restuple);
6142 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006143 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006144 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006145 &resunicode, newpos)) {
6146 Py_DECREF(restuple);
6147 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006148 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006149 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6150 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6151 Py_DECREF(restuple);
6152 return NULL;
6153 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006154 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006155 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006156 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006157 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6158 Py_DECREF(restuple);
6159 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006160 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006161 Py_INCREF(resunicode);
6162 Py_DECREF(restuple);
6163 return resunicode;
6164}
6165
Alexander Belopolsky40018472011-02-26 01:02:56 +00006166static PyObject *
6167unicode_encode_ucs1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006168 Py_ssize_t size,
6169 const char *errors,
6170 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006171{
6172 /* output object */
6173 PyObject *res;
6174 /* pointers to the beginning and end+1 of input */
6175 const Py_UNICODE *startp = p;
6176 const Py_UNICODE *endp = p + size;
6177 /* pointer to the beginning of the unencodable characters */
6178 /* const Py_UNICODE *badp = NULL; */
6179 /* pointer into the output */
6180 char *str;
6181 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006182 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006183 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6184 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006185 PyObject *errorHandler = NULL;
6186 PyObject *exc = NULL;
6187 /* the following variable is used for caching string comparisons
6188 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6189 int known_errorHandler = -1;
6190
6191 /* allocate enough for a simple encoding without
6192 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006193 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006194 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006195 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006196 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006197 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006198 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006199 ressize = size;
6200
6201 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006202 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006203
Benjamin Peterson29060642009-01-31 22:14:21 +00006204 /* can we encode this? */
6205 if (c<limit) {
6206 /* no overflow check, because we know that the space is enough */
6207 *str++ = (char)c;
6208 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006209 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006210 else {
6211 Py_ssize_t unicodepos = p-startp;
6212 Py_ssize_t requiredsize;
6213 PyObject *repunicode;
6214 Py_ssize_t repsize;
6215 Py_ssize_t newpos;
6216 Py_ssize_t respos;
6217 Py_UNICODE *uni2;
6218 /* startpos for collecting unencodable chars */
6219 const Py_UNICODE *collstart = p;
6220 const Py_UNICODE *collend = p;
6221 /* find all unecodable characters */
6222 while ((collend < endp) && ((*collend)>=limit))
6223 ++collend;
6224 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6225 if (known_errorHandler==-1) {
6226 if ((errors==NULL) || (!strcmp(errors, "strict")))
6227 known_errorHandler = 1;
6228 else if (!strcmp(errors, "replace"))
6229 known_errorHandler = 2;
6230 else if (!strcmp(errors, "ignore"))
6231 known_errorHandler = 3;
6232 else if (!strcmp(errors, "xmlcharrefreplace"))
6233 known_errorHandler = 4;
6234 else
6235 known_errorHandler = 0;
6236 }
6237 switch (known_errorHandler) {
6238 case 1: /* strict */
6239 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
6240 goto onError;
6241 case 2: /* replace */
6242 while (collstart++<collend)
6243 *str++ = '?'; /* fall through */
6244 case 3: /* ignore */
6245 p = collend;
6246 break;
6247 case 4: /* xmlcharrefreplace */
6248 respos = str - PyBytes_AS_STRING(res);
6249 /* determine replacement size (temporarily (mis)uses p) */
6250 for (p = collstart, repsize = 0; p < collend; ++p) {
6251 if (*p<10)
6252 repsize += 2+1+1;
6253 else if (*p<100)
6254 repsize += 2+2+1;
6255 else if (*p<1000)
6256 repsize += 2+3+1;
6257 else if (*p<10000)
6258 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006259#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006260 else
6261 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006262#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006263 else if (*p<100000)
6264 repsize += 2+5+1;
6265 else if (*p<1000000)
6266 repsize += 2+6+1;
6267 else
6268 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00006269#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006270 }
6271 requiredsize = respos+repsize+(endp-collend);
6272 if (requiredsize > ressize) {
6273 if (requiredsize<2*ressize)
6274 requiredsize = 2*ressize;
6275 if (_PyBytes_Resize(&res, requiredsize))
6276 goto onError;
6277 str = PyBytes_AS_STRING(res) + respos;
6278 ressize = requiredsize;
6279 }
6280 /* generate replacement (temporarily (mis)uses p) */
6281 for (p = collstart; p < collend; ++p) {
6282 str += sprintf(str, "&#%d;", (int)*p);
6283 }
6284 p = collend;
6285 break;
6286 default:
6287 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6288 encoding, reason, startp, size, &exc,
6289 collstart-startp, collend-startp, &newpos);
6290 if (repunicode == NULL)
6291 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006292 if (PyBytes_Check(repunicode)) {
6293 /* Directly copy bytes result to output. */
6294 repsize = PyBytes_Size(repunicode);
6295 if (repsize > 1) {
6296 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006297 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006298 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6299 Py_DECREF(repunicode);
6300 goto onError;
6301 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006302 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006303 ressize += repsize-1;
6304 }
6305 memcpy(str, PyBytes_AsString(repunicode), repsize);
6306 str += repsize;
6307 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006308 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006309 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006310 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006311 /* need more space? (at least enough for what we
6312 have+the replacement+the rest of the string, so
6313 we won't have to check space for encodable characters) */
6314 respos = str - PyBytes_AS_STRING(res);
6315 repsize = PyUnicode_GET_SIZE(repunicode);
6316 requiredsize = respos+repsize+(endp-collend);
6317 if (requiredsize > ressize) {
6318 if (requiredsize<2*ressize)
6319 requiredsize = 2*ressize;
6320 if (_PyBytes_Resize(&res, requiredsize)) {
6321 Py_DECREF(repunicode);
6322 goto onError;
6323 }
6324 str = PyBytes_AS_STRING(res) + respos;
6325 ressize = requiredsize;
6326 }
6327 /* check if there is anything unencodable in the replacement
6328 and copy it to the output */
6329 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
6330 c = *uni2;
6331 if (c >= limit) {
6332 raise_encode_exception(&exc, encoding, startp, size,
6333 unicodepos, unicodepos+1, reason);
6334 Py_DECREF(repunicode);
6335 goto onError;
6336 }
6337 *str = (char)c;
6338 }
6339 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006340 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006341 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006342 }
6343 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006344 /* Resize if we allocated to much */
6345 size = str - PyBytes_AS_STRING(res);
6346 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006347 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006348 if (_PyBytes_Resize(&res, size) < 0)
6349 goto onError;
6350 }
6351
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006352 Py_XDECREF(errorHandler);
6353 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006354 return res;
6355
6356 onError:
6357 Py_XDECREF(res);
6358 Py_XDECREF(errorHandler);
6359 Py_XDECREF(exc);
6360 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006361}
6362
Alexander Belopolsky40018472011-02-26 01:02:56 +00006363PyObject *
6364PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006365 Py_ssize_t size,
6366 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006367{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006368 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006369}
6370
Alexander Belopolsky40018472011-02-26 01:02:56 +00006371PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006372_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006373{
6374 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006375 PyErr_BadArgument();
6376 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006377 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006378 if (PyUnicode_READY(unicode) == -1)
6379 return NULL;
6380 /* Fast path: if it is a one-byte string, construct
6381 bytes object directly. */
6382 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6383 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6384 PyUnicode_GET_LENGTH(unicode));
6385 /* Non-Latin-1 characters present. Defer to above function to
6386 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006387 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006388 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006389 errors);
6390}
6391
6392PyObject*
6393PyUnicode_AsLatin1String(PyObject *unicode)
6394{
6395 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006396}
6397
6398/* --- 7-bit ASCII Codec -------------------------------------------------- */
6399
Alexander Belopolsky40018472011-02-26 01:02:56 +00006400PyObject *
6401PyUnicode_DecodeASCII(const char *s,
6402 Py_ssize_t size,
6403 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006404{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006405 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006406 PyUnicodeObject *v;
6407 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006408 Py_ssize_t startinpos;
6409 Py_ssize_t endinpos;
6410 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006411 const char *e;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006412 unsigned char* d;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006413 PyObject *errorHandler = NULL;
6414 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006415 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00006416
Guido van Rossumd57fd912000-03-10 22:53:23 +00006417 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006418 if (size == 1 && *(unsigned char*)s < 128)
6419 return PyUnicode_FromOrdinal(*(unsigned char*)s);
6420
6421 /* Fast path. Assume the input actually *is* ASCII, and allocate
6422 a single-block Unicode object with that assumption. If there is
6423 an error, drop the object and start over. */
6424 v = (PyUnicodeObject*)PyUnicode_New(size, 127);
6425 if (v == NULL)
6426 goto onError;
6427 d = PyUnicode_1BYTE_DATA(v);
6428 for (i = 0; i < size; i++) {
6429 unsigned char ch = ((unsigned char*)s)[i];
6430 if (ch < 128)
6431 d[i] = ch;
6432 else
6433 break;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006434 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006435 if (i == size)
6436 return (PyObject*)v;
6437 Py_DECREF(v); /* start over */
Tim Petersced69f82003-09-16 20:30:58 +00006438
Guido van Rossumd57fd912000-03-10 22:53:23 +00006439 v = _PyUnicode_New(size);
6440 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006441 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006442 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006443 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006444 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006445 e = s + size;
6446 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006447 register unsigned char c = (unsigned char)*s;
6448 if (c < 128) {
6449 *p++ = c;
6450 ++s;
6451 }
6452 else {
6453 startinpos = s-starts;
6454 endinpos = startinpos + 1;
6455 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
6456 if (unicode_decode_call_errorhandler(
6457 errors, &errorHandler,
6458 "ascii", "ordinal not in range(128)",
6459 &starts, &e, &startinpos, &endinpos, &exc, &s,
6460 &v, &outpos, &p))
6461 goto onError;
6462 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006463 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00006464 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinnerfe226c02011-10-03 03:52:20 +02006465 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006466 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006467 Py_XDECREF(errorHandler);
6468 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006469 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006470 Py_DECREF(v);
6471 return NULL;
6472 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006473 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006474
Benjamin Peterson29060642009-01-31 22:14:21 +00006475 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006476 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006477 Py_XDECREF(errorHandler);
6478 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006479 return NULL;
6480}
6481
Alexander Belopolsky40018472011-02-26 01:02:56 +00006482PyObject *
6483PyUnicode_EncodeASCII(const Py_UNICODE *p,
6484 Py_ssize_t size,
6485 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006486{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006487 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006488}
6489
Alexander Belopolsky40018472011-02-26 01:02:56 +00006490PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006491_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006492{
6493 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006494 PyErr_BadArgument();
6495 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006496 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006497 if (PyUnicode_READY(unicode) == -1)
6498 return NULL;
6499 /* Fast path: if it is an ASCII-only string, construct bytes object
6500 directly. Else defer to above function to raise the exception. */
6501 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6502 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6503 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006504 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006505 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006506 errors);
6507}
6508
6509PyObject *
6510PyUnicode_AsASCIIString(PyObject *unicode)
6511{
6512 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006513}
6514
Victor Stinner99b95382011-07-04 14:23:54 +02006515#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006516
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006517/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006518
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006519#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006520#define NEED_RETRY
6521#endif
6522
6523/* XXX This code is limited to "true" double-byte encodings, as
6524 a) it assumes an incomplete character consists of a single byte, and
6525 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00006526 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006527
/* Return non-zero if the byte at s[offset] should be treated as a DBCS
   lead byte.

   The byte must be flagged by IsDBCSLeadByte().  It is then accepted when
   CharPrev() reports no earlier character (prev == curr), when the
   previous character does not start with a lead byte, or when the previous
   character is two bytes long. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
6540/*
6541 * Decode MBCS string into unicode object. If 'final' is set, converts
6542 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6543 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006544static int
6545decode_mbcs(PyUnicodeObject **v,
6546 const char *s, /* MBCS string */
6547 int size, /* sizeof MBCS string */
6548 int final,
6549 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006550{
6551 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00006552 Py_ssize_t n;
6553 DWORD usize;
6554 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006555
6556 assert(size >= 0);
6557
Victor Stinner554f3f02010-06-16 23:33:54 +00006558 /* check and handle 'errors' arg */
6559 if (errors==NULL || strcmp(errors, "strict")==0)
6560 flags = MB_ERR_INVALID_CHARS;
6561 else if (strcmp(errors, "ignore")==0)
6562 flags = 0;
6563 else {
6564 PyErr_Format(PyExc_ValueError,
6565 "mbcs encoding does not support errors='%s'",
6566 errors);
6567 return -1;
6568 }
6569
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006570 /* Skip trailing lead-byte unless 'final' is set */
6571 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006572 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006573
6574 /* First get the size of the result */
6575 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006576 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6577 if (usize==0)
6578 goto mbcs_decode_error;
6579 } else
6580 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006581
6582 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006583 /* Create unicode object */
6584 *v = _PyUnicode_New(usize);
6585 if (*v == NULL)
6586 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006587 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006588 }
6589 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006590 /* Extend unicode object */
6591 n = PyUnicode_GET_SIZE(*v);
Victor Stinner2fd82272011-10-03 04:06:05 +02006592 if (PyUnicode_Resize((PyObject**)v, n + usize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006593 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006594 }
6595
6596 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00006597 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006598 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006599 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6600 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00006601 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006602 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006603 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00006604
6605mbcs_decode_error:
6606 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6607 we raise a UnicodeDecodeError - else it is a 'generic'
6608 windows error
6609 */
6610 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6611 /* Ideally, we should get reason from FormatMessage - this
6612 is the Windows 2000 English version of the message
6613 */
6614 PyObject *exc = NULL;
6615 const char *reason = "No mapping for the Unicode character exists "
6616 "in the target multi-byte code page.";
6617 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6618 if (exc != NULL) {
6619 PyCodec_StrictErrors(exc);
6620 Py_DECREF(exc);
6621 }
6622 } else {
6623 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6624 }
6625 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006626}
6627
Alexander Belopolsky40018472011-02-26 01:02:56 +00006628PyObject *
6629PyUnicode_DecodeMBCSStateful(const char *s,
6630 Py_ssize_t size,
6631 const char *errors,
6632 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006633{
6634 PyUnicodeObject *v = NULL;
6635 int done;
6636
6637 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006638 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006639
6640#ifdef NEED_RETRY
6641 retry:
6642 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006643 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006644 else
6645#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006646 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006647
6648 if (done < 0) {
6649 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006650 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006651 }
6652
6653 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006654 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006655
6656#ifdef NEED_RETRY
6657 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006658 s += done;
6659 size -= done;
6660 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006661 }
6662#endif
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006663 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006664 Py_DECREF(v);
6665 return NULL;
6666 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006667 return (PyObject *)v;
6668}
6669
Alexander Belopolsky40018472011-02-26 01:02:56 +00006670PyObject *
6671PyUnicode_DecodeMBCS(const char *s,
6672 Py_ssize_t size,
6673 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006674{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006675 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6676}
6677
6678/*
6679 * Convert unicode into string object (MBCS).
6680 * Returns 0 if succeed, -1 otherwise.
6681 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006682static int
6683encode_mbcs(PyObject **repr,
6684 const Py_UNICODE *p, /* unicode */
6685 int size, /* size of unicode */
6686 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006687{
Victor Stinner554f3f02010-06-16 23:33:54 +00006688 BOOL usedDefaultChar = FALSE;
6689 BOOL *pusedDefaultChar;
6690 int mbcssize;
6691 Py_ssize_t n;
6692 PyObject *exc = NULL;
6693 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006694
6695 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006696
Victor Stinner554f3f02010-06-16 23:33:54 +00006697 /* check and handle 'errors' arg */
6698 if (errors==NULL || strcmp(errors, "strict")==0) {
6699 flags = WC_NO_BEST_FIT_CHARS;
6700 pusedDefaultChar = &usedDefaultChar;
6701 } else if (strcmp(errors, "replace")==0) {
6702 flags = 0;
6703 pusedDefaultChar = NULL;
6704 } else {
6705 PyErr_Format(PyExc_ValueError,
6706 "mbcs encoding does not support errors='%s'",
6707 errors);
6708 return -1;
6709 }
6710
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006711 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006712 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006713 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
6714 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00006715 if (mbcssize == 0) {
6716 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6717 return -1;
6718 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006719 /* If we used a default char, then we failed! */
6720 if (pusedDefaultChar && *pusedDefaultChar)
6721 goto mbcs_encode_error;
6722 } else {
6723 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006724 }
6725
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006726 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006727 /* Create string object */
6728 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
6729 if (*repr == NULL)
6730 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006731 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006732 }
6733 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006734 /* Extend string object */
6735 n = PyBytes_Size(*repr);
6736 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
6737 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006738 }
6739
6740 /* Do the conversion */
6741 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006742 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006743 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
6744 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006745 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6746 return -1;
6747 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006748 if (pusedDefaultChar && *pusedDefaultChar)
6749 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006750 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006751 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006752
6753mbcs_encode_error:
6754 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
6755 Py_XDECREF(exc);
6756 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006757}
6758
Alexander Belopolsky40018472011-02-26 01:02:56 +00006759PyObject *
6760PyUnicode_EncodeMBCS(const Py_UNICODE *p,
6761 Py_ssize_t size,
6762 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006763{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006764 PyObject *repr = NULL;
6765 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00006766
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006767#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00006768 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006769 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006770 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006771 else
6772#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006773 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006774
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006775 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006776 Py_XDECREF(repr);
6777 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006778 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006779
6780#ifdef NEED_RETRY
6781 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006782 p += INT_MAX;
6783 size -= INT_MAX;
6784 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006785 }
6786#endif
6787
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006788 return repr;
6789}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006790
Alexander Belopolsky40018472011-02-26 01:02:56 +00006791PyObject *
6792PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006793{
6794 if (!PyUnicode_Check(unicode)) {
6795 PyErr_BadArgument();
6796 return NULL;
6797 }
6798 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006799 PyUnicode_GET_SIZE(unicode),
6800 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006801}
6802
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006803#undef NEED_RETRY
6804
Victor Stinner99b95382011-07-04 14:23:54 +02006805#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006806
Guido van Rossumd57fd912000-03-10 22:53:23 +00006807/* --- Character Mapping Codec -------------------------------------------- */
6808
Alexander Belopolsky40018472011-02-26 01:02:56 +00006809PyObject *
6810PyUnicode_DecodeCharmap(const char *s,
6811 Py_ssize_t size,
6812 PyObject *mapping,
6813 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006814{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006815 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006816 Py_ssize_t startinpos;
6817 Py_ssize_t endinpos;
6818 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006819 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006820 PyUnicodeObject *v;
6821 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006822 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006823 PyObject *errorHandler = NULL;
6824 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006825 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006826 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006827
Guido van Rossumd57fd912000-03-10 22:53:23 +00006828 /* Default to Latin-1 */
6829 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006830 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006831
6832 v = _PyUnicode_New(size);
6833 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006834 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006835 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006836 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006837 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006838 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006839 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006840 mapstring = PyUnicode_AS_UNICODE(mapping);
6841 maplen = PyUnicode_GET_SIZE(mapping);
6842 while (s < e) {
6843 unsigned char ch = *s;
6844 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006845
Benjamin Peterson29060642009-01-31 22:14:21 +00006846 if (ch < maplen)
6847 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006848
Benjamin Peterson29060642009-01-31 22:14:21 +00006849 if (x == 0xfffe) {
6850 /* undefined mapping */
6851 outpos = p-PyUnicode_AS_UNICODE(v);
6852 startinpos = s-starts;
6853 endinpos = startinpos+1;
6854 if (unicode_decode_call_errorhandler(
6855 errors, &errorHandler,
6856 "charmap", "character maps to <undefined>",
6857 &starts, &e, &startinpos, &endinpos, &exc, &s,
6858 &v, &outpos, &p)) {
6859 goto onError;
6860 }
6861 continue;
6862 }
6863 *p++ = x;
6864 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006865 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006866 }
6867 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006868 while (s < e) {
6869 unsigned char ch = *s;
6870 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006871
Benjamin Peterson29060642009-01-31 22:14:21 +00006872 /* Get mapping (char ordinal -> integer, Unicode char or None) */
6873 w = PyLong_FromLong((long)ch);
6874 if (w == NULL)
6875 goto onError;
6876 x = PyObject_GetItem(mapping, w);
6877 Py_DECREF(w);
6878 if (x == NULL) {
6879 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6880 /* No mapping found means: mapping is undefined. */
6881 PyErr_Clear();
6882 x = Py_None;
6883 Py_INCREF(x);
6884 } else
6885 goto onError;
6886 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006887
Benjamin Peterson29060642009-01-31 22:14:21 +00006888 /* Apply mapping */
6889 if (PyLong_Check(x)) {
6890 long value = PyLong_AS_LONG(x);
6891 if (value < 0 || value > 65535) {
6892 PyErr_SetString(PyExc_TypeError,
6893 "character mapping must be in range(65536)");
6894 Py_DECREF(x);
6895 goto onError;
6896 }
6897 *p++ = (Py_UNICODE)value;
6898 }
6899 else if (x == Py_None) {
6900 /* undefined mapping */
6901 outpos = p-PyUnicode_AS_UNICODE(v);
6902 startinpos = s-starts;
6903 endinpos = startinpos+1;
6904 if (unicode_decode_call_errorhandler(
6905 errors, &errorHandler,
6906 "charmap", "character maps to <undefined>",
6907 &starts, &e, &startinpos, &endinpos, &exc, &s,
6908 &v, &outpos, &p)) {
6909 Py_DECREF(x);
6910 goto onError;
6911 }
6912 Py_DECREF(x);
6913 continue;
6914 }
6915 else if (PyUnicode_Check(x)) {
6916 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006917
Benjamin Peterson29060642009-01-31 22:14:21 +00006918 if (targetsize == 1)
6919 /* 1-1 mapping */
6920 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006921
Benjamin Peterson29060642009-01-31 22:14:21 +00006922 else if (targetsize > 1) {
6923 /* 1-n mapping */
6924 if (targetsize > extrachars) {
6925 /* resize first */
6926 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
6927 Py_ssize_t needed = (targetsize - extrachars) + \
6928 (targetsize << 2);
6929 extrachars += needed;
6930 /* XXX overflow detection missing */
Victor Stinnerfe226c02011-10-03 03:52:20 +02006931 if (PyUnicode_Resize((PyObject**)&v,
Benjamin Peterson29060642009-01-31 22:14:21 +00006932 PyUnicode_GET_SIZE(v) + needed) < 0) {
6933 Py_DECREF(x);
6934 goto onError;
6935 }
6936 p = PyUnicode_AS_UNICODE(v) + oldpos;
6937 }
6938 Py_UNICODE_COPY(p,
6939 PyUnicode_AS_UNICODE(x),
6940 targetsize);
6941 p += targetsize;
6942 extrachars -= targetsize;
6943 }
6944 /* 1-0 mapping: skip the character */
6945 }
6946 else {
6947 /* wrong return value */
6948 PyErr_SetString(PyExc_TypeError,
6949 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006950 Py_DECREF(x);
6951 goto onError;
6952 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006953 Py_DECREF(x);
6954 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006955 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006956 }
6957 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinnerfe226c02011-10-03 03:52:20 +02006958 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006959 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006960 Py_XDECREF(errorHandler);
6961 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006962 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006963 Py_DECREF(v);
6964 return NULL;
6965 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006966 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006967
Benjamin Peterson29060642009-01-31 22:14:21 +00006968 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006969 Py_XDECREF(errorHandler);
6970 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006971 Py_XDECREF(v);
6972 return NULL;
6973}
6974
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006975/* Charmap encoding: the lookup table */
6976
/* Three-level lookup table built by PyUnicode_BuildEncodingMap() and read
   by encoding_map_lookup().  The object is allocated with extra room after
   the struct so that level23 can hold more than one byte. */
struct encoding_map {
    PyObject_HEAD
    /* Indexed by (c >> 11); 0xFF marks an unused entry, any other value
       selects a 16-entry block in level23. */
    unsigned char level1[32];
    /* Number of 16-byte level-2 blocks and of 128-byte level-3 blocks
       stored in level23. */
    int count2, count3;
    /* 16*count2 bytes of level-2 data followed by 128*count3 bytes of
       level-3 data. */
    unsigned char level23[1];
};
6983
6984static PyObject*
6985encoding_map_size(PyObject *obj, PyObject* args)
6986{
6987 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006988 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00006989 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006990}
6991
/* Method table installed as tp_methods of EncodingMapType. */
static PyMethodDef encoding_map_methods[] = {
    {"size", encoding_map_size, METH_NOARGS,
     PyDoc_STR("Return the size (in bytes) of this object") },
    { 0 }   /* sentinel */
};
6997
/* tp_dealloc slot of EncodingMapType: release the object's memory with
   PyObject_FREE(). */
static void
encoding_map_dealloc(PyObject* o)
{
    PyObject_FREE(o);
}
7003
/* Type object for struct encoding_map.  Only tp_dealloc and tp_methods are
   filled in besides the name, basic size and default flags; tp_new is 0. */
static PyTypeObject EncodingMapType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    "EncodingMap",          /*tp_name*/
    sizeof(struct encoding_map),   /*tp_basicsize*/
    0,                      /*tp_itemsize*/
    /* methods */
    encoding_map_dealloc,   /*tp_dealloc*/
    0,                      /*tp_print*/
    0,                      /*tp_getattr*/
    0,                      /*tp_setattr*/
    0,                      /*tp_reserved*/
    0,                      /*tp_repr*/
    0,                      /*tp_as_number*/
    0,                      /*tp_as_sequence*/
    0,                      /*tp_as_mapping*/
    0,                      /*tp_hash*/
    0,                      /*tp_call*/
    0,                      /*tp_str*/
    0,                      /*tp_getattro*/
    0,                      /*tp_setattro*/
    0,                      /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
    0,                      /*tp_doc*/
    0,                      /*tp_traverse*/
    0,                      /*tp_clear*/
    0,                      /*tp_richcompare*/
    0,                      /*tp_weaklistoffset*/
    0,                      /*tp_iter*/
    0,                      /*tp_iternext*/
    encoding_map_methods,   /*tp_methods*/
    0,                      /*tp_members*/
    0,                      /*tp_getset*/
    0,                      /*tp_base*/
    0,                      /*tp_dict*/
    0,                      /*tp_descr_get*/
    0,                      /*tp_descr_set*/
    0,                      /*tp_dictoffset*/
    0,                      /*tp_init*/
    0,                      /*tp_alloc*/
    0,                      /*tp_new*/
    0,                      /*tp_free*/
    0,                      /*tp_is_gc*/
};
7047
7048PyObject*
7049PyUnicode_BuildEncodingMap(PyObject* string)
7050{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007051 PyObject *result;
7052 struct encoding_map *mresult;
7053 int i;
7054 int need_dict = 0;
7055 unsigned char level1[32];
7056 unsigned char level2[512];
7057 unsigned char *mlevel1, *mlevel2, *mlevel3;
7058 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007059 int kind;
7060 void *data;
7061 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007062
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007063 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007064 PyErr_BadArgument();
7065 return NULL;
7066 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007067 kind = PyUnicode_KIND(string);
7068 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007069 memset(level1, 0xFF, sizeof level1);
7070 memset(level2, 0xFF, sizeof level2);
7071
7072 /* If there isn't a one-to-one mapping of NULL to \0,
7073 or if there are non-BMP characters, we need to use
7074 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007075 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007076 need_dict = 1;
7077 for (i = 1; i < 256; i++) {
7078 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007079 ch = PyUnicode_READ(kind, data, i);
7080 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007081 need_dict = 1;
7082 break;
7083 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007084 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007085 /* unmapped character */
7086 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007087 l1 = ch >> 11;
7088 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007089 if (level1[l1] == 0xFF)
7090 level1[l1] = count2++;
7091 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007092 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007093 }
7094
7095 if (count2 >= 0xFF || count3 >= 0xFF)
7096 need_dict = 1;
7097
7098 if (need_dict) {
7099 PyObject *result = PyDict_New();
7100 PyObject *key, *value;
7101 if (!result)
7102 return NULL;
7103 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007104 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007105 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007106 if (!key || !value)
7107 goto failed1;
7108 if (PyDict_SetItem(result, key, value) == -1)
7109 goto failed1;
7110 Py_DECREF(key);
7111 Py_DECREF(value);
7112 }
7113 return result;
7114 failed1:
7115 Py_XDECREF(key);
7116 Py_XDECREF(value);
7117 Py_DECREF(result);
7118 return NULL;
7119 }
7120
7121 /* Create a three-level trie */
7122 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7123 16*count2 + 128*count3 - 1);
7124 if (!result)
7125 return PyErr_NoMemory();
7126 PyObject_Init(result, &EncodingMapType);
7127 mresult = (struct encoding_map*)result;
7128 mresult->count2 = count2;
7129 mresult->count3 = count3;
7130 mlevel1 = mresult->level1;
7131 mlevel2 = mresult->level23;
7132 mlevel3 = mresult->level23 + 16*count2;
7133 memcpy(mlevel1, level1, 32);
7134 memset(mlevel2, 0xFF, 16*count2);
7135 memset(mlevel3, 0, 128*count3);
7136 count3 = 0;
7137 for (i = 1; i < 256; i++) {
7138 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007139 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007140 /* unmapped character */
7141 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007142 o1 = PyUnicode_READ(kind, data, i)>>11;
7143 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007144 i2 = 16*mlevel1[o1] + o2;
7145 if (mlevel2[i2] == 0xFF)
7146 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007147 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007148 i3 = 128*mlevel2[i2] + o3;
7149 mlevel3[i3] = i;
7150 }
7151 return result;
7152}
7153
7154static int
7155encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
7156{
7157 struct encoding_map *map = (struct encoding_map*)mapping;
7158 int l1 = c>>11;
7159 int l2 = (c>>7) & 0xF;
7160 int l3 = c & 0x7F;
7161 int i;
7162
7163#ifdef Py_UNICODE_WIDE
7164 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007165 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007166 }
7167#endif
7168 if (c == 0)
7169 return 0;
7170 /* level 1*/
7171 i = map->level1[l1];
7172 if (i == 0xFF) {
7173 return -1;
7174 }
7175 /* level 2*/
7176 i = map->level23[16*i+l2];
7177 if (i == 0xFF) {
7178 return -1;
7179 }
7180 /* level 3 */
7181 i = map->level23[16*map->count2 + 128*i + l3];
7182 if (i == 0) {
7183 return -1;
7184 }
7185 return i;
7186}
7187
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007188/* Lookup the character ch in the mapping. If the character
7189 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007190 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007191static PyObject *
7192charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007193{
Christian Heimes217cfd12007-12-02 14:31:20 +00007194 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007195 PyObject *x;
7196
7197 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007198 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007199 x = PyObject_GetItem(mapping, w);
7200 Py_DECREF(w);
7201 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007202 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7203 /* No mapping found means: mapping is undefined. */
7204 PyErr_Clear();
7205 x = Py_None;
7206 Py_INCREF(x);
7207 return x;
7208 } else
7209 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007210 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007211 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007212 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007213 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007214 long value = PyLong_AS_LONG(x);
7215 if (value < 0 || value > 255) {
7216 PyErr_SetString(PyExc_TypeError,
7217 "character mapping must be in range(256)");
7218 Py_DECREF(x);
7219 return NULL;
7220 }
7221 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007222 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007223 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007224 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007225 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007226 /* wrong return value */
7227 PyErr_Format(PyExc_TypeError,
7228 "character mapping must return integer, bytes or None, not %.400s",
7229 x->ob_type->tp_name);
7230 Py_DECREF(x);
7231 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007232 }
7233}
7234
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007235static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007236charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007237{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007238 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7239 /* exponentially overallocate to minimize reallocations */
7240 if (requiredsize < 2*outsize)
7241 requiredsize = 2*outsize;
7242 if (_PyBytes_Resize(outobj, requiredsize))
7243 return -1;
7244 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007245}
7246
/* Outcome of charmapencode_output():
     enc_SUCCESS   - the character was mapped and written to the output,
     enc_FAILED    - the mapping has no entry for the character,
     enc_EXCEPTION - the lookup or the output resize raised an exception. */
typedef enum charmapencode_result {
    enc_SUCCESS, enc_FAILED, enc_EXCEPTION
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007250/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007251 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007252 space is available. Return a new reference to the object that
7253 was put in the output buffer, or Py_None, if the mapping was undefined
7254 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007255 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007256static charmapencode_result
7257charmapencode_output(Py_UNICODE c, PyObject *mapping,
7258 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007259{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007260 PyObject *rep;
7261 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007262 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007263
Christian Heimes90aa7642007-12-19 02:45:37 +00007264 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007265 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007266 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007267 if (res == -1)
7268 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007269 if (outsize<requiredsize)
7270 if (charmapencode_resize(outobj, outpos, requiredsize))
7271 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007272 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007273 outstart[(*outpos)++] = (char)res;
7274 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007275 }
7276
7277 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007278 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007279 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007280 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007281 Py_DECREF(rep);
7282 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007283 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007284 if (PyLong_Check(rep)) {
7285 Py_ssize_t requiredsize = *outpos+1;
7286 if (outsize<requiredsize)
7287 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7288 Py_DECREF(rep);
7289 return enc_EXCEPTION;
7290 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007291 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007292 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007293 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007294 else {
7295 const char *repchars = PyBytes_AS_STRING(rep);
7296 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7297 Py_ssize_t requiredsize = *outpos+repsize;
7298 if (outsize<requiredsize)
7299 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7300 Py_DECREF(rep);
7301 return enc_EXCEPTION;
7302 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007303 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007304 memcpy(outstart + *outpos, repchars, repsize);
7305 *outpos += repsize;
7306 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007307 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007308 Py_DECREF(rep);
7309 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007310}
7311
7312/* handle an error in PyUnicode_EncodeCharmap
7313 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007314static int
7315charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00007316 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007317 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007318 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007319 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007320{
7321 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007322 Py_ssize_t repsize;
7323 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007324 Py_UNICODE *uni2;
7325 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007326 Py_ssize_t collstartpos = *inpos;
7327 Py_ssize_t collendpos = *inpos+1;
7328 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007329 char *encoding = "charmap";
7330 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007331 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007332
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007333 /* find all unencodable characters */
7334 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007335 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007336 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007337 int res = encoding_map_lookup(p[collendpos], mapping);
7338 if (res != -1)
7339 break;
7340 ++collendpos;
7341 continue;
7342 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007343
Benjamin Peterson29060642009-01-31 22:14:21 +00007344 rep = charmapencode_lookup(p[collendpos], mapping);
7345 if (rep==NULL)
7346 return -1;
7347 else if (rep!=Py_None) {
7348 Py_DECREF(rep);
7349 break;
7350 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007351 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007352 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007353 }
7354 /* cache callback name lookup
7355 * (if not done yet, i.e. it's the first error) */
7356 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007357 if ((errors==NULL) || (!strcmp(errors, "strict")))
7358 *known_errorHandler = 1;
7359 else if (!strcmp(errors, "replace"))
7360 *known_errorHandler = 2;
7361 else if (!strcmp(errors, "ignore"))
7362 *known_errorHandler = 3;
7363 else if (!strcmp(errors, "xmlcharrefreplace"))
7364 *known_errorHandler = 4;
7365 else
7366 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007367 }
7368 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007369 case 1: /* strict */
7370 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7371 return -1;
7372 case 2: /* replace */
7373 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007374 x = charmapencode_output('?', mapping, res, respos);
7375 if (x==enc_EXCEPTION) {
7376 return -1;
7377 }
7378 else if (x==enc_FAILED) {
7379 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7380 return -1;
7381 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007382 }
7383 /* fall through */
7384 case 3: /* ignore */
7385 *inpos = collendpos;
7386 break;
7387 case 4: /* xmlcharrefreplace */
7388 /* generate replacement (temporarily (mis)uses p) */
7389 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007390 char buffer[2+29+1+1];
7391 char *cp;
7392 sprintf(buffer, "&#%d;", (int)p[collpos]);
7393 for (cp = buffer; *cp; ++cp) {
7394 x = charmapencode_output(*cp, mapping, res, respos);
7395 if (x==enc_EXCEPTION)
7396 return -1;
7397 else if (x==enc_FAILED) {
7398 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7399 return -1;
7400 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007401 }
7402 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007403 *inpos = collendpos;
7404 break;
7405 default:
7406 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00007407 encoding, reason, p, size, exceptionObject,
7408 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007409 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007410 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007411 if (PyBytes_Check(repunicode)) {
7412 /* Directly copy bytes result to output. */
7413 Py_ssize_t outsize = PyBytes_Size(*res);
7414 Py_ssize_t requiredsize;
7415 repsize = PyBytes_Size(repunicode);
7416 requiredsize = *respos + repsize;
7417 if (requiredsize > outsize)
7418 /* Make room for all additional bytes. */
7419 if (charmapencode_resize(res, respos, requiredsize)) {
7420 Py_DECREF(repunicode);
7421 return -1;
7422 }
7423 memcpy(PyBytes_AsString(*res) + *respos,
7424 PyBytes_AsString(repunicode), repsize);
7425 *respos += repsize;
7426 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007427 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007428 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007429 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007430 /* generate replacement */
7431 repsize = PyUnicode_GET_SIZE(repunicode);
7432 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007433 x = charmapencode_output(*uni2, mapping, res, respos);
7434 if (x==enc_EXCEPTION) {
7435 return -1;
7436 }
7437 else if (x==enc_FAILED) {
7438 Py_DECREF(repunicode);
7439 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7440 return -1;
7441 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007442 }
7443 *inpos = newpos;
7444 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007445 }
7446 return 0;
7447}
7448
Alexander Belopolsky40018472011-02-26 01:02:56 +00007449PyObject *
7450PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7451 Py_ssize_t size,
7452 PyObject *mapping,
7453 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007454{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007455 /* output object */
7456 PyObject *res = NULL;
7457 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007458 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007459 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007460 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007461 PyObject *errorHandler = NULL;
7462 PyObject *exc = NULL;
7463 /* the following variable is used for caching string comparisons
7464 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7465 * 3=ignore, 4=xmlcharrefreplace */
7466 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007467
7468 /* Default to Latin-1 */
7469 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007470 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007471
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007472 /* allocate enough for a simple encoding without
7473 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007474 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007475 if (res == NULL)
7476 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007477 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007478 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007479
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007480 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007481 /* try to encode it */
7482 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7483 if (x==enc_EXCEPTION) /* error */
7484 goto onError;
7485 if (x==enc_FAILED) { /* unencodable character */
7486 if (charmap_encoding_error(p, size, &inpos, mapping,
7487 &exc,
7488 &known_errorHandler, &errorHandler, errors,
7489 &res, &respos)) {
7490 goto onError;
7491 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007492 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007493 else
7494 /* done with this character => adjust input position */
7495 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007496 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007497
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007498 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007499 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007500 if (_PyBytes_Resize(&res, respos) < 0)
7501 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00007502
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007503 Py_XDECREF(exc);
7504 Py_XDECREF(errorHandler);
7505 return res;
7506
Benjamin Peterson29060642009-01-31 22:14:21 +00007507 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007508 Py_XDECREF(res);
7509 Py_XDECREF(exc);
7510 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007511 return NULL;
7512}
7513
Alexander Belopolsky40018472011-02-26 01:02:56 +00007514PyObject *
7515PyUnicode_AsCharmapString(PyObject *unicode,
7516 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007517{
7518 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007519 PyErr_BadArgument();
7520 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007521 }
7522 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007523 PyUnicode_GET_SIZE(unicode),
7524 mapping,
7525 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007526}
7527
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007528/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007529static void
7530make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007531 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007532 Py_ssize_t startpos, Py_ssize_t endpos,
7533 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007534{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007535 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007536 *exceptionObject = _PyUnicodeTranslateError_Create(
7537 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007538 }
7539 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007540 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7541 goto onError;
7542 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7543 goto onError;
7544 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7545 goto onError;
7546 return;
7547 onError:
7548 Py_DECREF(*exceptionObject);
7549 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007550 }
7551}
7552
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007553/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007554static void
7555raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007556 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007557 Py_ssize_t startpos, Py_ssize_t endpos,
7558 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007559{
7560 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007561 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007562 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007563 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007564}
7565
7566/* error handling callback helper:
7567 build arguments, call the callback and check the arguments,
7568 put the result into newpos and return the replacement string, which
7569 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007570static PyObject *
7571unicode_translate_call_errorhandler(const char *errors,
7572 PyObject **errorHandler,
7573 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007574 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007575 Py_ssize_t startpos, Py_ssize_t endpos,
7576 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007577{
Benjamin Peterson142957c2008-07-04 19:55:29 +00007578 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007579
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007580 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007581 PyObject *restuple;
7582 PyObject *resunicode;
7583
7584 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007585 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007586 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007587 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007588 }
7589
7590 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007591 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007592 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007593 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007594
7595 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00007596 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007597 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007598 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007599 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00007600 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007601 Py_DECREF(restuple);
7602 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007603 }
7604 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00007605 &resunicode, &i_newpos)) {
7606 Py_DECREF(restuple);
7607 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007608 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007609 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007610 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007611 else
7612 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007613 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007614 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7615 Py_DECREF(restuple);
7616 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007617 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007618 Py_INCREF(resunicode);
7619 Py_DECREF(restuple);
7620 return resunicode;
7621}
7622
7623/* Lookup the character ch in the mapping and put the result in result,
7624 which must be decrefed by the caller.
7625 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007626static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007627charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007628{
Christian Heimes217cfd12007-12-02 14:31:20 +00007629 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007630 PyObject *x;
7631
7632 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007633 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007634 x = PyObject_GetItem(mapping, w);
7635 Py_DECREF(w);
7636 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007637 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7638 /* No mapping found means: use 1:1 mapping. */
7639 PyErr_Clear();
7640 *result = NULL;
7641 return 0;
7642 } else
7643 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007644 }
7645 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007646 *result = x;
7647 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007648 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007649 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007650 long value = PyLong_AS_LONG(x);
7651 long max = PyUnicode_GetMax();
7652 if (value < 0 || value > max) {
7653 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00007654 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007655 Py_DECREF(x);
7656 return -1;
7657 }
7658 *result = x;
7659 return 0;
7660 }
7661 else if (PyUnicode_Check(x)) {
7662 *result = x;
7663 return 0;
7664 }
7665 else {
7666 /* wrong return value */
7667 PyErr_SetString(PyExc_TypeError,
7668 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007669 Py_DECREF(x);
7670 return -1;
7671 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007672}
7673/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00007674 if not reallocate and adjust various state variables.
7675 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007676static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007677charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00007678 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007679{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007680 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00007681 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007682 /* exponentially overallocate to minimize reallocations */
7683 if (requiredsize < 2 * oldsize)
7684 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007685 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
7686 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007687 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007688 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007689 }
7690 return 0;
7691}
7692/* lookup the character, put the result in the output string and adjust
7693 various state variables. Return a new reference to the object that
7694 was put in the output buffer in *result, or Py_None, if the mapping was
7695 undefined (in which case no character was written).
7696 The called must decref result.
7697 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007698static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007699charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
7700 PyObject *mapping, Py_UCS4 **output,
7701 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007702 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007703{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007704 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
7705 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00007706 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007707 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007708 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007709 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007710 }
7711 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007712 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00007713 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007714 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007715 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007716 }
7717 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007718 Py_ssize_t repsize;
7719 if (PyUnicode_READY(*res) == -1)
7720 return -1;
7721 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00007722 if (repsize==1) {
7723 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007724 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00007725 }
7726 else if (repsize!=0) {
7727 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007728 Py_ssize_t requiredsize = *opos +
7729 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00007730 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007731 Py_ssize_t i;
7732 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007733 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007734 for(i = 0; i < repsize; i++)
7735 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00007736 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007737 }
7738 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007739 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007740 return 0;
7741}
7742
Alexander Belopolsky40018472011-02-26 01:02:56 +00007743PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007744_PyUnicode_TranslateCharmap(PyObject *input,
7745 PyObject *mapping,
7746 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007747{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007748 /* input object */
7749 char *idata;
7750 Py_ssize_t size, i;
7751 int kind;
7752 /* output buffer */
7753 Py_UCS4 *output = NULL;
7754 Py_ssize_t osize;
7755 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007756 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007757 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007758 char *reason = "character maps to <undefined>";
7759 PyObject *errorHandler = NULL;
7760 PyObject *exc = NULL;
7761 /* the following variable is used for caching string comparisons
7762 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7763 * 3=ignore, 4=xmlcharrefreplace */
7764 int known_errorHandler = -1;
7765
Guido van Rossumd57fd912000-03-10 22:53:23 +00007766 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007767 PyErr_BadArgument();
7768 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007769 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007770
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007771 if (PyUnicode_READY(input) == -1)
7772 return NULL;
7773 idata = (char*)PyUnicode_DATA(input);
7774 kind = PyUnicode_KIND(input);
7775 size = PyUnicode_GET_LENGTH(input);
7776 i = 0;
7777
7778 if (size == 0) {
7779 Py_INCREF(input);
7780 return input;
7781 }
7782
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007783 /* allocate enough for a simple 1:1 translation without
7784 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007785 osize = size;
7786 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
7787 opos = 0;
7788 if (output == NULL) {
7789 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00007790 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007791 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007792
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007793 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007794 /* try to encode it */
7795 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007796 if (charmaptranslate_output(input, i, mapping,
7797 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007798 Py_XDECREF(x);
7799 goto onError;
7800 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007801 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00007802 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007803 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007804 else { /* untranslatable character */
7805 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7806 Py_ssize_t repsize;
7807 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007808 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00007809 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007810 Py_ssize_t collstart = i;
7811 Py_ssize_t collend = i+1;
7812 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007813
Benjamin Peterson29060642009-01-31 22:14:21 +00007814 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007815 while (collend < size) {
7816 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007817 goto onError;
7818 Py_XDECREF(x);
7819 if (x!=Py_None)
7820 break;
7821 ++collend;
7822 }
7823 /* cache callback name lookup
7824 * (if not done yet, i.e. it's the first error) */
7825 if (known_errorHandler==-1) {
7826 if ((errors==NULL) || (!strcmp(errors, "strict")))
7827 known_errorHandler = 1;
7828 else if (!strcmp(errors, "replace"))
7829 known_errorHandler = 2;
7830 else if (!strcmp(errors, "ignore"))
7831 known_errorHandler = 3;
7832 else if (!strcmp(errors, "xmlcharrefreplace"))
7833 known_errorHandler = 4;
7834 else
7835 known_errorHandler = 0;
7836 }
7837 switch (known_errorHandler) {
7838 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007839 raise_translate_exception(&exc, input, collstart,
7840 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007841 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007842 case 2: /* replace */
7843 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007844 for (coll = collstart; coll<collend; coll++)
7845 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00007846 /* fall through */
7847 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007848 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007849 break;
7850 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007851 /* generate replacement (temporarily (mis)uses i) */
7852 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007853 char buffer[2+29+1+1];
7854 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007855 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
7856 if (charmaptranslate_makespace(&output, &osize,
7857 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00007858 goto onError;
7859 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007860 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00007861 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007862 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007863 break;
7864 default:
7865 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007866 reason, input, &exc,
7867 collstart, collend, &newpos);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02007868 if (repunicode == NULL || _PyUnicode_READY_REPLACE(&repunicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00007869 goto onError;
7870 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007871 repsize = PyUnicode_GET_LENGTH(repunicode);
7872 if (charmaptranslate_makespace(&output, &osize,
7873 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007874 Py_DECREF(repunicode);
7875 goto onError;
7876 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007877 for (uni2 = 0; repsize-->0; ++uni2)
7878 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
7879 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00007880 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007881 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007882 }
7883 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007884 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
7885 if (!res)
7886 goto onError;
7887 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007888 Py_XDECREF(exc);
7889 Py_XDECREF(errorHandler);
7890 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007891
Benjamin Peterson29060642009-01-31 22:14:21 +00007892 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007893 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007894 Py_XDECREF(exc);
7895 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007896 return NULL;
7897}
7898
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007899/* Deprecated. Use PyUnicode_Translate instead. */
7900PyObject *
7901PyUnicode_TranslateCharmap(const Py_UNICODE *p,
7902 Py_ssize_t size,
7903 PyObject *mapping,
7904 const char *errors)
7905{
7906 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7907 if (!unicode)
7908 return NULL;
7909 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
7910}
7911
Alexander Belopolsky40018472011-02-26 01:02:56 +00007912PyObject *
7913PyUnicode_Translate(PyObject *str,
7914 PyObject *mapping,
7915 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007916{
7917 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007918
Guido van Rossumd57fd912000-03-10 22:53:23 +00007919 str = PyUnicode_FromObject(str);
7920 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007921 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007922 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007923 Py_DECREF(str);
7924 return result;
Tim Petersced69f82003-09-16 20:30:58 +00007925
Benjamin Peterson29060642009-01-31 22:14:21 +00007926 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007927 Py_XDECREF(str);
7928 return NULL;
7929}
Tim Petersced69f82003-09-16 20:30:58 +00007930
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007931static Py_UCS4
7932fix_decimal_and_space_to_ascii(PyUnicodeObject *self)
7933{
7934 /* No need to call PyUnicode_READY(self) because this function is only
7935 called as a callback from fixup() which does it already. */
7936 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
7937 const int kind = PyUnicode_KIND(self);
7938 void *data = PyUnicode_DATA(self);
7939 Py_UCS4 maxchar = 0, ch, fixed;
7940 Py_ssize_t i;
7941
7942 for (i = 0; i < len; ++i) {
7943 ch = PyUnicode_READ(kind, data, i);
7944 fixed = 0;
7945 if (ch > 127) {
7946 if (Py_UNICODE_ISSPACE(ch))
7947 fixed = ' ';
7948 else {
7949 const int decimal = Py_UNICODE_TODECIMAL(ch);
7950 if (decimal >= 0)
7951 fixed = '0' + decimal;
7952 }
7953 if (fixed != 0) {
7954 if (fixed > maxchar)
7955 maxchar = fixed;
7956 PyUnicode_WRITE(kind, data, i, fixed);
7957 }
7958 else if (ch > maxchar)
7959 maxchar = ch;
7960 }
7961 else if (ch > maxchar)
7962 maxchar = ch;
7963 }
7964
7965 return maxchar;
7966}
7967
7968PyObject *
7969_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
7970{
7971 if (!PyUnicode_Check(unicode)) {
7972 PyErr_BadInternalCall();
7973 return NULL;
7974 }
7975 if (PyUnicode_READY(unicode) == -1)
7976 return NULL;
7977 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
7978 /* If the string is already ASCII, just return the same string */
7979 Py_INCREF(unicode);
7980 return unicode;
7981 }
7982 return fixup((PyUnicodeObject *)unicode, fix_decimal_and_space_to_ascii);
7983}
7984
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007985PyObject *
7986PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
7987 Py_ssize_t length)
7988{
7989 PyObject *result;
7990 Py_UNICODE *p; /* write pointer into result */
7991 Py_ssize_t i;
7992 /* Copy to a new string */
7993 result = (PyObject *)_PyUnicode_New(length);
7994 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
7995 if (result == NULL)
7996 return result;
7997 p = PyUnicode_AS_UNICODE(result);
7998 /* Iterate over code points */
7999 for (i = 0; i < length; i++) {
8000 Py_UNICODE ch =s[i];
8001 if (ch > 127) {
8002 int decimal = Py_UNICODE_TODECIMAL(ch);
8003 if (decimal >= 0)
8004 p[i] = '0' + decimal;
8005 }
8006 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008007 if (PyUnicode_READY((PyUnicodeObject*)result) == -1) {
8008 Py_DECREF(result);
8009 return NULL;
8010 }
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008011 return result;
8012}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008013/* --- Decimal Encoder ---------------------------------------------------- */
8014
Alexander Belopolsky40018472011-02-26 01:02:56 +00008015int
8016PyUnicode_EncodeDecimal(Py_UNICODE *s,
8017 Py_ssize_t length,
8018 char *output,
8019 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008020{
8021 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008022 PyObject *errorHandler = NULL;
8023 PyObject *exc = NULL;
8024 const char *encoding = "decimal";
8025 const char *reason = "invalid decimal Unicode string";
8026 /* the following variable is used for caching string comparisons
8027 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
8028 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008029
8030 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008031 PyErr_BadArgument();
8032 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008033 }
8034
8035 p = s;
8036 end = s + length;
8037 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008038 register Py_UNICODE ch = *p;
8039 int decimal;
8040 PyObject *repunicode;
8041 Py_ssize_t repsize;
8042 Py_ssize_t newpos;
8043 Py_UNICODE *uni2;
8044 Py_UNICODE *collstart;
8045 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00008046
Benjamin Peterson29060642009-01-31 22:14:21 +00008047 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008048 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00008049 ++p;
8050 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008051 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008052 decimal = Py_UNICODE_TODECIMAL(ch);
8053 if (decimal >= 0) {
8054 *output++ = '0' + decimal;
8055 ++p;
8056 continue;
8057 }
8058 if (0 < ch && ch < 256) {
8059 *output++ = (char)ch;
8060 ++p;
8061 continue;
8062 }
8063 /* All other characters are considered unencodable */
8064 collstart = p;
8065 collend = p+1;
8066 while (collend < end) {
8067 if ((0 < *collend && *collend < 256) ||
8068 !Py_UNICODE_ISSPACE(*collend) ||
8069 Py_UNICODE_TODECIMAL(*collend))
8070 break;
8071 }
8072 /* cache callback name lookup
8073 * (if not done yet, i.e. it's the first error) */
8074 if (known_errorHandler==-1) {
8075 if ((errors==NULL) || (!strcmp(errors, "strict")))
8076 known_errorHandler = 1;
8077 else if (!strcmp(errors, "replace"))
8078 known_errorHandler = 2;
8079 else if (!strcmp(errors, "ignore"))
8080 known_errorHandler = 3;
8081 else if (!strcmp(errors, "xmlcharrefreplace"))
8082 known_errorHandler = 4;
8083 else
8084 known_errorHandler = 0;
8085 }
8086 switch (known_errorHandler) {
8087 case 1: /* strict */
8088 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
8089 goto onError;
8090 case 2: /* replace */
8091 for (p = collstart; p < collend; ++p)
8092 *output++ = '?';
8093 /* fall through */
8094 case 3: /* ignore */
8095 p = collend;
8096 break;
8097 case 4: /* xmlcharrefreplace */
8098 /* generate replacement (temporarily (mis)uses p) */
8099 for (p = collstart; p < collend; ++p)
8100 output += sprintf(output, "&#%d;", (int)*p);
8101 p = collend;
8102 break;
8103 default:
8104 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
8105 encoding, reason, s, length, &exc,
8106 collstart-s, collend-s, &newpos);
8107 if (repunicode == NULL)
8108 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008109 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00008110 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008111 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8112 Py_DECREF(repunicode);
8113 goto onError;
8114 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008115 /* generate replacement */
8116 repsize = PyUnicode_GET_SIZE(repunicode);
8117 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
8118 Py_UNICODE ch = *uni2;
8119 if (Py_UNICODE_ISSPACE(ch))
8120 *output++ = ' ';
8121 else {
8122 decimal = Py_UNICODE_TODECIMAL(ch);
8123 if (decimal >= 0)
8124 *output++ = '0' + decimal;
8125 else if (0 < ch && ch < 256)
8126 *output++ = (char)ch;
8127 else {
8128 Py_DECREF(repunicode);
8129 raise_encode_exception(&exc, encoding,
8130 s, length, collstart-s, collend-s, reason);
8131 goto onError;
8132 }
8133 }
8134 }
8135 p = s + newpos;
8136 Py_DECREF(repunicode);
8137 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00008138 }
8139 /* 0-terminate the output string */
8140 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008141 Py_XDECREF(exc);
8142 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008143 return 0;
8144
Benjamin Peterson29060642009-01-31 22:14:21 +00008145 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008146 Py_XDECREF(exc);
8147 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008148 return -1;
8149}
8150
Guido van Rossumd57fd912000-03-10 22:53:23 +00008151/* --- Helpers ------------------------------------------------------------ */
8152
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008153#include "stringlib/ucs1lib.h"
8154#include "stringlib/fastsearch.h"
8155#include "stringlib/partition.h"
8156#include "stringlib/split.h"
8157#include "stringlib/count.h"
8158#include "stringlib/find.h"
8159#include "stringlib/localeutil.h"
8160#include "stringlib/undef.h"
8161
8162#include "stringlib/ucs2lib.h"
8163#include "stringlib/fastsearch.h"
8164#include "stringlib/partition.h"
8165#include "stringlib/split.h"
8166#include "stringlib/count.h"
8167#include "stringlib/find.h"
8168#include "stringlib/localeutil.h"
8169#include "stringlib/undef.h"
8170
8171#include "stringlib/ucs4lib.h"
8172#include "stringlib/fastsearch.h"
8173#include "stringlib/partition.h"
8174#include "stringlib/split.h"
8175#include "stringlib/count.h"
8176#include "stringlib/find.h"
8177#include "stringlib/localeutil.h"
8178#include "stringlib/undef.h"
8179
8180static Py_ssize_t
8181any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
8182 const Py_UCS1*, Py_ssize_t,
8183 Py_ssize_t, Py_ssize_t),
8184 Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
8185 const Py_UCS2*, Py_ssize_t,
8186 Py_ssize_t, Py_ssize_t),
8187 Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t,
8188 const Py_UCS4*, Py_ssize_t,
8189 Py_ssize_t, Py_ssize_t),
8190 PyObject* s1, PyObject* s2,
8191 Py_ssize_t start,
8192 Py_ssize_t end)
8193{
8194 int kind1, kind2, kind;
8195 void *buf1, *buf2;
8196 Py_ssize_t len1, len2, result;
8197
8198 kind1 = PyUnicode_KIND(s1);
8199 kind2 = PyUnicode_KIND(s2);
8200 kind = kind1 > kind2 ? kind1 : kind2;
8201 buf1 = PyUnicode_DATA(s1);
8202 buf2 = PyUnicode_DATA(s2);
8203 if (kind1 != kind)
8204 buf1 = _PyUnicode_AsKind(s1, kind);
8205 if (!buf1)
8206 return -2;
8207 if (kind2 != kind)
8208 buf2 = _PyUnicode_AsKind(s2, kind);
8209 if (!buf2) {
8210 if (kind1 != kind) PyMem_Free(buf1);
8211 return -2;
8212 }
8213 len1 = PyUnicode_GET_LENGTH(s1);
8214 len2 = PyUnicode_GET_LENGTH(s2);
8215
8216 switch(kind) {
8217 case PyUnicode_1BYTE_KIND:
8218 result = ucs1(buf1, len1, buf2, len2, start, end);
8219 break;
8220 case PyUnicode_2BYTE_KIND:
8221 result = ucs2(buf1, len1, buf2, len2, start, end);
8222 break;
8223 case PyUnicode_4BYTE_KIND:
8224 result = ucs4(buf1, len1, buf2, len2, start, end);
8225 break;
8226 default:
8227 assert(0); result = -2;
8228 }
8229
8230 if (kind1 != kind)
8231 PyMem_Free(buf1);
8232 if (kind2 != kind)
8233 PyMem_Free(buf2);
8234
8235 return result;
8236}
8237
8238Py_ssize_t
8239_PyUnicode_InsertThousandsGrouping(int kind, void *data,
8240 Py_ssize_t n_buffer,
8241 void *digits, Py_ssize_t n_digits,
8242 Py_ssize_t min_width,
8243 const char *grouping,
8244 const char *thousands_sep)
8245{
8246 switch(kind) {
8247 case PyUnicode_1BYTE_KIND:
8248 return _PyUnicode_ucs1_InsertThousandsGrouping(
8249 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8250 min_width, grouping, thousands_sep);
8251 case PyUnicode_2BYTE_KIND:
8252 return _PyUnicode_ucs2_InsertThousandsGrouping(
8253 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
8254 min_width, grouping, thousands_sep);
8255 case PyUnicode_4BYTE_KIND:
8256 return _PyUnicode_ucs4_InsertThousandsGrouping(
8257 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
8258 min_width, grouping, thousands_sep);
8259 }
8260 assert(0);
8261 return -1;
8262}
8263
8264
Eric Smith8c663262007-08-25 02:26:07 +00008265#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00008266#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008267
Thomas Wouters477c8d52006-05-27 19:21:47 +00008268#include "stringlib/count.h"
8269#include "stringlib/find.h"
Eric Smith5807c412008-05-11 21:00:57 +00008270
/* helper macro to fixup start/end slice values:
   - `end` larger than `len` is clamped to `len`;
   - a negative `end` or `start` has `len` added once and is then
     floored at 0.
   `start` and `end` must be modifiable lvalues; every argument is
   evaluated more than once.  The body is wrapped in do/while(0) so that
   `ADJUST_INDICES(...);` is a single statement, safe inside an unbraced
   if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00008285
Alexander Belopolsky40018472011-02-26 01:02:56 +00008286Py_ssize_t
8287PyUnicode_Count(PyObject *str,
8288 PyObject *substr,
8289 Py_ssize_t start,
8290 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008291{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008292 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008293 PyUnicodeObject* str_obj;
8294 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008295 int kind1, kind2, kind;
8296 void *buf1 = NULL, *buf2 = NULL;
8297 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008298
Thomas Wouters477c8d52006-05-27 19:21:47 +00008299 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008300 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008301 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008302 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02008303 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008304 Py_DECREF(str_obj);
8305 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008306 }
Tim Petersced69f82003-09-16 20:30:58 +00008307
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008308 kind1 = PyUnicode_KIND(str_obj);
8309 kind2 = PyUnicode_KIND(sub_obj);
8310 kind = kind1 > kind2 ? kind1 : kind2;
8311 buf1 = PyUnicode_DATA(str_obj);
8312 if (kind1 != kind)
8313 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
8314 if (!buf1)
8315 goto onError;
8316 buf2 = PyUnicode_DATA(sub_obj);
8317 if (kind2 != kind)
8318 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
8319 if (!buf2)
8320 goto onError;
8321 len1 = PyUnicode_GET_LENGTH(str_obj);
8322 len2 = PyUnicode_GET_LENGTH(sub_obj);
8323
8324 ADJUST_INDICES(start, end, len1);
8325 switch(kind) {
8326 case PyUnicode_1BYTE_KIND:
8327 result = ucs1lib_count(
8328 ((Py_UCS1*)buf1) + start, end - start,
8329 buf2, len2, PY_SSIZE_T_MAX
8330 );
8331 break;
8332 case PyUnicode_2BYTE_KIND:
8333 result = ucs2lib_count(
8334 ((Py_UCS2*)buf1) + start, end - start,
8335 buf2, len2, PY_SSIZE_T_MAX
8336 );
8337 break;
8338 case PyUnicode_4BYTE_KIND:
8339 result = ucs4lib_count(
8340 ((Py_UCS4*)buf1) + start, end - start,
8341 buf2, len2, PY_SSIZE_T_MAX
8342 );
8343 break;
8344 default:
8345 assert(0); result = 0;
8346 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008347
8348 Py_DECREF(sub_obj);
8349 Py_DECREF(str_obj);
8350
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008351 if (kind1 != kind)
8352 PyMem_Free(buf1);
8353 if (kind2 != kind)
8354 PyMem_Free(buf2);
8355
Guido van Rossumd57fd912000-03-10 22:53:23 +00008356 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008357 onError:
8358 Py_DECREF(sub_obj);
8359 Py_DECREF(str_obj);
8360 if (kind1 != kind && buf1)
8361 PyMem_Free(buf1);
8362 if (kind2 != kind && buf2)
8363 PyMem_Free(buf2);
8364 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008365}
8366
Alexander Belopolsky40018472011-02-26 01:02:56 +00008367Py_ssize_t
8368PyUnicode_Find(PyObject *str,
8369 PyObject *sub,
8370 Py_ssize_t start,
8371 Py_ssize_t end,
8372 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008373{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008374 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008375
Guido van Rossumd57fd912000-03-10 22:53:23 +00008376 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008377 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008378 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008379 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008380 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008381 Py_DECREF(str);
8382 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008383 }
Tim Petersced69f82003-09-16 20:30:58 +00008384
Thomas Wouters477c8d52006-05-27 19:21:47 +00008385 if (direction > 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008386 result = any_find_slice(
8387 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
8388 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008389 );
8390 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008391 result = any_find_slice(
8392 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
8393 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008394 );
8395
Guido van Rossumd57fd912000-03-10 22:53:23 +00008396 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008397 Py_DECREF(sub);
8398
Guido van Rossumd57fd912000-03-10 22:53:23 +00008399 return result;
8400}
8401
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008402Py_ssize_t
8403PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8404 Py_ssize_t start, Py_ssize_t end,
8405 int direction)
8406{
8407 char *result;
8408 int kind;
8409 if (PyUnicode_READY(str) == -1)
8410 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02008411 if (start < 0 || end < 0) {
8412 PyErr_SetString(PyExc_IndexError, "string index out of range");
8413 return -2;
8414 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008415 if (end > PyUnicode_GET_LENGTH(str))
8416 end = PyUnicode_GET_LENGTH(str);
8417 kind = PyUnicode_KIND(str);
8418 result = findchar(PyUnicode_1BYTE_DATA(str)
8419 + PyUnicode_KIND_SIZE(kind, start),
8420 kind,
8421 end-start, ch, direction);
8422 if (!result)
8423 return -1;
8424 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8425}
8426
Alexander Belopolsky40018472011-02-26 01:02:56 +00008427static int
8428tailmatch(PyUnicodeObject *self,
8429 PyUnicodeObject *substring,
8430 Py_ssize_t start,
8431 Py_ssize_t end,
8432 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008433{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008434 int kind_self;
8435 int kind_sub;
8436 void *data_self;
8437 void *data_sub;
8438 Py_ssize_t offset;
8439 Py_ssize_t i;
8440 Py_ssize_t end_sub;
8441
8442 if (PyUnicode_READY(self) == -1 ||
8443 PyUnicode_READY(substring) == -1)
8444 return 0;
8445
8446 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008447 return 1;
8448
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008449 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8450 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008451 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008452 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008453
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008454 kind_self = PyUnicode_KIND(self);
8455 data_self = PyUnicode_DATA(self);
8456 kind_sub = PyUnicode_KIND(substring);
8457 data_sub = PyUnicode_DATA(substring);
8458 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8459
8460 if (direction > 0)
8461 offset = end;
8462 else
8463 offset = start;
8464
8465 if (PyUnicode_READ(kind_self, data_self, offset) ==
8466 PyUnicode_READ(kind_sub, data_sub, 0) &&
8467 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8468 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8469 /* If both are of the same kind, memcmp is sufficient */
8470 if (kind_self == kind_sub) {
8471 return ! memcmp((char *)data_self +
8472 (offset * PyUnicode_CHARACTER_SIZE(substring)),
8473 data_sub,
8474 PyUnicode_GET_LENGTH(substring) *
8475 PyUnicode_CHARACTER_SIZE(substring));
8476 }
8477 /* otherwise we have to compare each character by first accesing it */
8478 else {
8479 /* We do not need to compare 0 and len(substring)-1 because
8480 the if statement above ensured already that they are equal
8481 when we end up here. */
8482 // TODO: honor direction and do a forward or backwards search
8483 for (i = 1; i < end_sub; ++i) {
8484 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8485 PyUnicode_READ(kind_sub, data_sub, i))
8486 return 0;
8487 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008488 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008489 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008490 }
8491
8492 return 0;
8493}
8494
Alexander Belopolsky40018472011-02-26 01:02:56 +00008495Py_ssize_t
8496PyUnicode_Tailmatch(PyObject *str,
8497 PyObject *substr,
8498 Py_ssize_t start,
8499 Py_ssize_t end,
8500 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008501{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008502 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008503
Guido van Rossumd57fd912000-03-10 22:53:23 +00008504 str = PyUnicode_FromObject(str);
8505 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008506 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008507 substr = PyUnicode_FromObject(substr);
8508 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008509 Py_DECREF(str);
8510 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008511 }
Tim Petersced69f82003-09-16 20:30:58 +00008512
Guido van Rossumd57fd912000-03-10 22:53:23 +00008513 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00008514 (PyUnicodeObject *)substr,
8515 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008516 Py_DECREF(str);
8517 Py_DECREF(substr);
8518 return result;
8519}
8520
Guido van Rossumd57fd912000-03-10 22:53:23 +00008521/* Apply fixfct filter to the Unicode object self and return a
8522 reference to the modified object */
8523
Alexander Belopolsky40018472011-02-26 01:02:56 +00008524static PyObject *
8525fixup(PyUnicodeObject *self,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008526 Py_UCS4 (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008527{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008528 PyObject *u;
8529 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008530
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008531 if (PyUnicode_READY(self) == -1)
8532 return NULL;
8533 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8534 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8535 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008536 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008537 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008538
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008539 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
8540 PyUnicode_GET_LENGTH(u) * PyUnicode_CHARACTER_SIZE(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008541
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008542 /* fix functions return the new maximum character in a string,
8543 if the kind of the resulting unicode object does not change,
8544 everything is fine. Otherwise we need to change the string kind
8545 and re-run the fix function. */
8546 maxchar_new = fixfct((PyUnicodeObject*)u);
8547 if (maxchar_new == 0)
8548 /* do nothing, keep maxchar_new at 0 which means no changes. */;
8549 else if (maxchar_new <= 127)
8550 maxchar_new = 127;
8551 else if (maxchar_new <= 255)
8552 maxchar_new = 255;
8553 else if (maxchar_new <= 65535)
8554 maxchar_new = 65535;
8555 else
8556 maxchar_new = 1114111; /* 0x10ffff */
8557
8558 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008559 /* fixfct should return TRUE if it modified the buffer. If
8560 FALSE, return a reference to the original buffer instead
8561 (to save space, not time) */
8562 Py_INCREF(self);
8563 Py_DECREF(u);
8564 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008565 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008566 else if (maxchar_new == maxchar_old) {
8567 return u;
8568 }
8569 else {
8570 /* In case the maximum character changed, we need to
8571 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008572 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008573 if (v == NULL) {
8574 Py_DECREF(u);
8575 return NULL;
8576 }
8577 if (maxchar_new > maxchar_old) {
8578 /* If the maxchar increased so that the kind changed, not all
8579 characters are representable anymore and we need to fix the
8580 string again. This only happens in very few cases. */
Victor Stinner157f83f2011-09-28 21:41:31 +02008581 if (PyUnicode_CopyCharacters(v, 0,
8582 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008583 PyUnicode_GET_LENGTH(self)) < 0)
8584 {
8585 Py_DECREF(u);
8586 return NULL;
8587 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008588 maxchar_old = fixfct((PyUnicodeObject*)v);
8589 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8590 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008591 else {
Victor Stinner157f83f2011-09-28 21:41:31 +02008592 if (PyUnicode_CopyCharacters(v, 0,
8593 u, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008594 PyUnicode_GET_LENGTH(self)) < 0)
8595 {
8596 Py_DECREF(u);
8597 return NULL;
8598 }
8599 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008600
8601 Py_DECREF(u);
8602 return v;
8603 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008604}
8605
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008606static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008607fixupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008608{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008609 /* No need to call PyUnicode_READY(self) because this function is only
8610 called as a callback from fixup() which does it already. */
8611 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8612 const int kind = PyUnicode_KIND(self);
8613 void *data = PyUnicode_DATA(self);
8614 int touched = 0;
8615 Py_UCS4 maxchar = 0;
8616 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008617
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008618 for (i = 0; i < len; ++i) {
8619 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8620 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8621 if (up != ch) {
8622 if (up > maxchar)
8623 maxchar = up;
8624 PyUnicode_WRITE(kind, data, i, up);
8625 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008626 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008627 else if (ch > maxchar)
8628 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008629 }
8630
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008631 if (touched)
8632 return maxchar;
8633 else
8634 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008635}
8636
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008637static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008638fixlower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008639{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008640 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8641 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8642 const int kind = PyUnicode_KIND(self);
8643 void *data = PyUnicode_DATA(self);
8644 int touched = 0;
8645 Py_UCS4 maxchar = 0;
8646 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008647
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008648 for(i = 0; i < len; ++i) {
8649 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8650 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8651 if (lo != ch) {
8652 if (lo > maxchar)
8653 maxchar = lo;
8654 PyUnicode_WRITE(kind, data, i, lo);
8655 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008656 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008657 else if (ch > maxchar)
8658 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008659 }
8660
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008661 if (touched)
8662 return maxchar;
8663 else
8664 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008665}
8666
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008667static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008668fixswapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008669{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008670 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8671 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8672 const int kind = PyUnicode_KIND(self);
8673 void *data = PyUnicode_DATA(self);
8674 int touched = 0;
8675 Py_UCS4 maxchar = 0;
8676 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008677
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008678 for(i = 0; i < len; ++i) {
8679 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8680 Py_UCS4 nu = 0;
8681
8682 if (Py_UNICODE_ISUPPER(ch))
8683 nu = Py_UNICODE_TOLOWER(ch);
8684 else if (Py_UNICODE_ISLOWER(ch))
8685 nu = Py_UNICODE_TOUPPER(ch);
8686
8687 if (nu != 0) {
8688 if (nu > maxchar)
8689 maxchar = nu;
8690 PyUnicode_WRITE(kind, data, i, nu);
8691 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008692 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008693 else if (ch > maxchar)
8694 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008695 }
8696
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008697 if (touched)
8698 return maxchar;
8699 else
8700 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008701}
8702
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008703static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008704fixcapitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008705{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008706 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8707 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8708 const int kind = PyUnicode_KIND(self);
8709 void *data = PyUnicode_DATA(self);
8710 int touched = 0;
8711 Py_UCS4 maxchar = 0;
8712 Py_ssize_t i = 0;
8713 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00008714
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008715 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008716 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008717
8718 ch = PyUnicode_READ(kind, data, i);
8719 if (!Py_UNICODE_ISUPPER(ch)) {
8720 maxchar = Py_UNICODE_TOUPPER(ch);
8721 PyUnicode_WRITE(kind, data, i, maxchar);
8722 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008723 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008724 ++i;
8725 for(; i < len; ++i) {
8726 ch = PyUnicode_READ(kind, data, i);
8727 if (!Py_UNICODE_ISLOWER(ch)) {
8728 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8729 if (lo > maxchar)
8730 maxchar = lo;
8731 PyUnicode_WRITE(kind, data, i, lo);
8732 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008733 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008734 else if (ch > maxchar)
8735 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008736 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008737
8738 if (touched)
8739 return maxchar;
8740 else
8741 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008742}
8743
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008744static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008745fixtitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008746{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008747 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8748 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8749 const int kind = PyUnicode_KIND(self);
8750 void *data = PyUnicode_DATA(self);
8751 Py_UCS4 maxchar = 0;
8752 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008753 int previous_is_cased;
8754
8755 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008756 if (len == 1) {
8757 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8758 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
8759 if (ti != ch) {
8760 PyUnicode_WRITE(kind, data, i, ti);
8761 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00008762 }
8763 else
8764 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008765 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008766 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008767 for(; i < len; ++i) {
8768 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8769 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00008770
Benjamin Peterson29060642009-01-31 22:14:21 +00008771 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008772 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00008773 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008774 nu = Py_UNICODE_TOTITLE(ch);
8775
8776 if (nu > maxchar)
8777 maxchar = nu;
8778 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00008779
Benjamin Peterson29060642009-01-31 22:14:21 +00008780 if (Py_UNICODE_ISLOWER(ch) ||
8781 Py_UNICODE_ISUPPER(ch) ||
8782 Py_UNICODE_ISTITLE(ch))
8783 previous_is_cased = 1;
8784 else
8785 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008786 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008787 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008788}
8789
Tim Peters8ce9f162004-08-27 01:49:32 +00008790PyObject *
8791PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008792{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008793 PyObject *sep = NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008794 Py_ssize_t seplen = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008795 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00008796 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008797 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
8798 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00008799 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008800 Py_ssize_t sz, i, res_offset;
8801 Py_UCS4 maxchar = 0;
8802 Py_UCS4 item_maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008803
Tim Peters05eba1f2004-08-27 21:32:02 +00008804 fseq = PySequence_Fast(seq, "");
8805 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008806 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00008807 }
8808
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008809 /* NOTE: the following code can't call back into Python code,
8810 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00008811 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008812
Tim Peters05eba1f2004-08-27 21:32:02 +00008813 seqlen = PySequence_Fast_GET_SIZE(fseq);
8814 /* If empty sequence, return u"". */
8815 if (seqlen == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008816 res = PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008817 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00008818 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008819 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00008820 /* If singleton sequence with an exact Unicode, return that. */
8821 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008822 item = items[0];
8823 if (PyUnicode_CheckExact(item)) {
8824 Py_INCREF(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008825 res = item;
Benjamin Peterson29060642009-01-31 22:14:21 +00008826 goto Done;
8827 }
Tim Peters8ce9f162004-08-27 01:49:32 +00008828 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008829 else {
8830 /* Set up sep and seplen */
8831 if (separator == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008832 /* fall back to a blank space separator */
8833 sep = PyUnicode_FromOrdinal(' ');
Victor Stinnere9a29352011-10-01 02:14:59 +02008834 if (!sep)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008835 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00008836 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008837 else {
8838 if (!PyUnicode_Check(separator)) {
8839 PyErr_Format(PyExc_TypeError,
8840 "separator: expected str instance,"
8841 " %.80s found",
8842 Py_TYPE(separator)->tp_name);
8843 goto onError;
8844 }
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02008845 if (PyUnicode_READY(separator))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008846 goto onError;
8847 sep = separator;
8848 seplen = PyUnicode_GET_LENGTH(separator);
8849 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
8850 /* inc refcount to keep this code path symetric with the
8851 above case of a blank separator */
8852 Py_INCREF(sep);
Tim Peters05eba1f2004-08-27 21:32:02 +00008853 }
8854 }
8855
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008856 /* There are at least two things to join, or else we have a subclass
8857 * of str in the sequence.
8858 * Do a pre-pass to figure out the total amount of space we'll
8859 * need (sz), and see whether all argument are strings.
8860 */
8861 sz = 0;
8862 for (i = 0; i < seqlen; i++) {
8863 const Py_ssize_t old_sz = sz;
8864 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00008865 if (!PyUnicode_Check(item)) {
8866 PyErr_Format(PyExc_TypeError,
8867 "sequence item %zd: expected str instance,"
8868 " %.80s found",
8869 i, Py_TYPE(item)->tp_name);
8870 goto onError;
8871 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008872 if (PyUnicode_READY(item) == -1)
8873 goto onError;
8874 sz += PyUnicode_GET_LENGTH(item);
8875 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
8876 if (item_maxchar > maxchar)
8877 maxchar = item_maxchar;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008878 if (i != 0)
8879 sz += seplen;
8880 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
8881 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008882 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008883 goto onError;
8884 }
8885 }
Tim Petersced69f82003-09-16 20:30:58 +00008886
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008887 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008888 if (res == NULL)
8889 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00008890
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008891 /* Catenate everything. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008892 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008893 Py_ssize_t itemlen;
8894 item = items[i];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008895 itemlen = PyUnicode_GET_LENGTH(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008896 /* Copy item, and maybe the separator. */
8897 if (i) {
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008898 if (PyUnicode_CopyCharacters(res, res_offset,
8899 sep, 0, seplen) < 0)
8900 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008901 res_offset += seplen;
Benjamin Peterson29060642009-01-31 22:14:21 +00008902 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008903 if (PyUnicode_CopyCharacters(res, res_offset,
8904 item, 0, itemlen) < 0)
8905 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008906 res_offset += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00008907 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008908 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00008909
Benjamin Peterson29060642009-01-31 22:14:21 +00008910 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00008911 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008912 Py_XDECREF(sep);
8913 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008914
Benjamin Peterson29060642009-01-31 22:14:21 +00008915 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00008916 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008917 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00008918 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008919 return NULL;
8920}
8921
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008922#define FILL(kind, data, value, start, length) \
8923 do { \
8924 Py_ssize_t i_ = 0; \
8925 assert(kind != PyUnicode_WCHAR_KIND); \
8926 switch ((kind)) { \
8927 case PyUnicode_1BYTE_KIND: { \
8928 unsigned char * to_ = (unsigned char *)((data)) + (start); \
8929 memset(to_, (unsigned char)value, length); \
8930 break; \
8931 } \
8932 case PyUnicode_2BYTE_KIND: { \
8933 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
8934 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
8935 break; \
8936 } \
8937 default: { \
8938 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
8939 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
8940 break; \
8941 } \
8942 } \
8943 } while (0)
8944
Alexander Belopolsky40018472011-02-26 01:02:56 +00008945static PyUnicodeObject *
8946pad(PyUnicodeObject *self,
8947 Py_ssize_t left,
8948 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008949 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008950{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008951 PyObject *u;
8952 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008953 int kind;
8954 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008955
8956 if (left < 0)
8957 left = 0;
8958 if (right < 0)
8959 right = 0;
8960
Tim Peters7a29bd52001-09-12 03:03:31 +00008961 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008962 Py_INCREF(self);
8963 return self;
8964 }
8965
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008966 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
8967 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00008968 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
8969 return NULL;
8970 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008971 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
8972 if (fill > maxchar)
8973 maxchar = fill;
8974 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008975 if (!u)
8976 return NULL;
8977
8978 kind = PyUnicode_KIND(u);
8979 data = PyUnicode_DATA(u);
8980 if (left)
8981 FILL(kind, data, fill, 0, left);
8982 if (right)
8983 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinner157f83f2011-09-28 21:41:31 +02008984 if (PyUnicode_CopyCharacters(u, left,
8985 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008986 _PyUnicode_LENGTH(self)) < 0)
8987 {
8988 Py_DECREF(u);
8989 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008990 }
8991
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008992 return (PyUnicodeObject*)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008993}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008994#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00008995
Alexander Belopolsky40018472011-02-26 01:02:56 +00008996PyObject *
8997PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008998{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008999 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009000
9001 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009002 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009003 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009004
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009005 switch(PyUnicode_KIND(string)) {
9006 case PyUnicode_1BYTE_KIND:
9007 list = ucs1lib_splitlines(
9008 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
9009 PyUnicode_GET_LENGTH(string), keepends);
9010 break;
9011 case PyUnicode_2BYTE_KIND:
9012 list = ucs2lib_splitlines(
9013 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
9014 PyUnicode_GET_LENGTH(string), keepends);
9015 break;
9016 case PyUnicode_4BYTE_KIND:
9017 list = ucs4lib_splitlines(
9018 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
9019 PyUnicode_GET_LENGTH(string), keepends);
9020 break;
9021 default:
9022 assert(0);
9023 list = 0;
9024 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009025 Py_DECREF(string);
9026 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009027}
9028
Alexander Belopolsky40018472011-02-26 01:02:56 +00009029static PyObject *
9030split(PyUnicodeObject *self,
9031 PyUnicodeObject *substring,
9032 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009033{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009034 int kind1, kind2, kind;
9035 void *buf1, *buf2;
9036 Py_ssize_t len1, len2;
9037 PyObject* out;
9038
Guido van Rossumd57fd912000-03-10 22:53:23 +00009039 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009040 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009041
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009042 if (PyUnicode_READY(self) == -1)
9043 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009044
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009045 if (substring == NULL)
9046 switch(PyUnicode_KIND(self)) {
9047 case PyUnicode_1BYTE_KIND:
9048 return ucs1lib_split_whitespace(
9049 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9050 PyUnicode_GET_LENGTH(self), maxcount
9051 );
9052 case PyUnicode_2BYTE_KIND:
9053 return ucs2lib_split_whitespace(
9054 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9055 PyUnicode_GET_LENGTH(self), maxcount
9056 );
9057 case PyUnicode_4BYTE_KIND:
9058 return ucs4lib_split_whitespace(
9059 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9060 PyUnicode_GET_LENGTH(self), maxcount
9061 );
9062 default:
9063 assert(0);
9064 return NULL;
9065 }
9066
9067 if (PyUnicode_READY(substring) == -1)
9068 return NULL;
9069
9070 kind1 = PyUnicode_KIND(self);
9071 kind2 = PyUnicode_KIND(substring);
9072 kind = kind1 > kind2 ? kind1 : kind2;
9073 buf1 = PyUnicode_DATA(self);
9074 buf2 = PyUnicode_DATA(substring);
9075 if (kind1 != kind)
9076 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9077 if (!buf1)
9078 return NULL;
9079 if (kind2 != kind)
9080 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9081 if (!buf2) {
9082 if (kind1 != kind) PyMem_Free(buf1);
9083 return NULL;
9084 }
9085 len1 = PyUnicode_GET_LENGTH(self);
9086 len2 = PyUnicode_GET_LENGTH(substring);
9087
9088 switch(kind) {
9089 case PyUnicode_1BYTE_KIND:
9090 out = ucs1lib_split(
9091 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9092 break;
9093 case PyUnicode_2BYTE_KIND:
9094 out = ucs2lib_split(
9095 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9096 break;
9097 case PyUnicode_4BYTE_KIND:
9098 out = ucs4lib_split(
9099 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9100 break;
9101 default:
9102 out = NULL;
9103 }
9104 if (kind1 != kind)
9105 PyMem_Free(buf1);
9106 if (kind2 != kind)
9107 PyMem_Free(buf2);
9108 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009109}
9110
Alexander Belopolsky40018472011-02-26 01:02:56 +00009111static PyObject *
9112rsplit(PyUnicodeObject *self,
9113 PyUnicodeObject *substring,
9114 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009115{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009116 int kind1, kind2, kind;
9117 void *buf1, *buf2;
9118 Py_ssize_t len1, len2;
9119 PyObject* out;
9120
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009121 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009122 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009123
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009124 if (PyUnicode_READY(self) == -1)
9125 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009126
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009127 if (substring == NULL)
9128 switch(PyUnicode_KIND(self)) {
9129 case PyUnicode_1BYTE_KIND:
9130 return ucs1lib_rsplit_whitespace(
9131 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9132 PyUnicode_GET_LENGTH(self), maxcount
9133 );
9134 case PyUnicode_2BYTE_KIND:
9135 return ucs2lib_rsplit_whitespace(
9136 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9137 PyUnicode_GET_LENGTH(self), maxcount
9138 );
9139 case PyUnicode_4BYTE_KIND:
9140 return ucs4lib_rsplit_whitespace(
9141 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9142 PyUnicode_GET_LENGTH(self), maxcount
9143 );
9144 default:
9145 assert(0);
9146 return NULL;
9147 }
9148
9149 if (PyUnicode_READY(substring) == -1)
9150 return NULL;
9151
9152 kind1 = PyUnicode_KIND(self);
9153 kind2 = PyUnicode_KIND(substring);
9154 kind = kind1 > kind2 ? kind1 : kind2;
9155 buf1 = PyUnicode_DATA(self);
9156 buf2 = PyUnicode_DATA(substring);
9157 if (kind1 != kind)
9158 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9159 if (!buf1)
9160 return NULL;
9161 if (kind2 != kind)
9162 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9163 if (!buf2) {
9164 if (kind1 != kind) PyMem_Free(buf1);
9165 return NULL;
9166 }
9167 len1 = PyUnicode_GET_LENGTH(self);
9168 len2 = PyUnicode_GET_LENGTH(substring);
9169
9170 switch(kind) {
9171 case PyUnicode_1BYTE_KIND:
9172 out = ucs1lib_rsplit(
9173 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9174 break;
9175 case PyUnicode_2BYTE_KIND:
9176 out = ucs2lib_rsplit(
9177 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9178 break;
9179 case PyUnicode_4BYTE_KIND:
9180 out = ucs4lib_rsplit(
9181 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9182 break;
9183 default:
9184 out = NULL;
9185 }
9186 if (kind1 != kind)
9187 PyMem_Free(buf1);
9188 if (kind2 != kind)
9189 PyMem_Free(buf2);
9190 return out;
9191}
9192
9193static Py_ssize_t
9194anylib_find(int kind, void *buf1, Py_ssize_t len1,
9195 void *buf2, Py_ssize_t len2, Py_ssize_t offset)
9196{
9197 switch(kind) {
9198 case PyUnicode_1BYTE_KIND:
9199 return ucs1lib_find(buf1, len1, buf2, len2, offset);
9200 case PyUnicode_2BYTE_KIND:
9201 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9202 case PyUnicode_4BYTE_KIND:
9203 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9204 }
9205 assert(0);
9206 return -1;
9207}
9208
9209static Py_ssize_t
9210anylib_count(int kind, void* sbuf, Py_ssize_t slen,
9211 void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
9212{
9213 switch(kind) {
9214 case PyUnicode_1BYTE_KIND:
9215 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
9216 case PyUnicode_2BYTE_KIND:
9217 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9218 case PyUnicode_4BYTE_KIND:
9219 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9220 }
9221 assert(0);
9222 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009223}
9224
Alexander Belopolsky40018472011-02-26 01:02:56 +00009225static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009226replace(PyObject *self, PyObject *str1,
9227 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009228{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009229 PyObject *u;
9230 char *sbuf = PyUnicode_DATA(self);
9231 char *buf1 = PyUnicode_DATA(str1);
9232 char *buf2 = PyUnicode_DATA(str2);
9233 int srelease = 0, release1 = 0, release2 = 0;
9234 int skind = PyUnicode_KIND(self);
9235 int kind1 = PyUnicode_KIND(str1);
9236 int kind2 = PyUnicode_KIND(str2);
9237 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9238 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9239 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009240
9241 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009242 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009243 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009244 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009245
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009246 if (skind < kind1)
9247 /* substring too wide to be present */
9248 goto nothing;
9249
9250 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00009251 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009252 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009253 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009254 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009255 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009256 /* replace characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009257 Py_UCS4 u1, u2, maxchar;
9258 int mayshrink, rkind;
9259 u1 = PyUnicode_READ_CHAR(str1, 0);
9260 if (!findchar(sbuf, PyUnicode_KIND(self),
9261 slen, u1, 1))
Thomas Wouters477c8d52006-05-27 19:21:47 +00009262 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009263 u2 = PyUnicode_READ_CHAR(str2, 0);
9264 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9265 /* Replacing u1 with u2 may cause a maxchar reduction in the
9266 result string. */
9267 mayshrink = maxchar > 127;
9268 if (u2 > maxchar) {
9269 maxchar = u2;
9270 mayshrink = 0;
9271 }
9272 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009273 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009274 goto error;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009275 if (PyUnicode_CopyCharacters(u, 0,
9276 (PyObject*)self, 0, slen) < 0)
9277 {
9278 Py_DECREF(u);
9279 return NULL;
9280 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009281 rkind = PyUnicode_KIND(u);
9282 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
9283 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009284 if (--maxcount < 0)
9285 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009286 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009287 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009288 if (mayshrink) {
9289 PyObject *tmp = u;
9290 u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp),
9291 PyUnicode_GET_LENGTH(tmp));
9292 Py_DECREF(tmp);
9293 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009294 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009295 int rkind = skind;
9296 char *res;
9297 if (kind1 < rkind) {
9298 /* widen substring */
9299 buf1 = _PyUnicode_AsKind(str1, rkind);
9300 if (!buf1) goto error;
9301 release1 = 1;
9302 }
9303 i = anylib_find(rkind, sbuf, slen, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009304 if (i < 0)
9305 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009306 if (rkind > kind2) {
9307 /* widen replacement */
9308 buf2 = _PyUnicode_AsKind(str2, rkind);
9309 if (!buf2) goto error;
9310 release2 = 1;
9311 }
9312 else if (rkind < kind2) {
9313 /* widen self and buf1 */
9314 rkind = kind2;
9315 if (release1) PyMem_Free(buf1);
9316 sbuf = _PyUnicode_AsKind(self, rkind);
9317 if (!sbuf) goto error;
9318 srelease = 1;
9319 buf1 = _PyUnicode_AsKind(str1, rkind);
9320 if (!buf1) goto error;
9321 release1 = 1;
9322 }
9323 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen));
9324 if (!res) {
9325 PyErr_NoMemory();
9326 goto error;
9327 }
9328 memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen));
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009329 /* change everything in-place, starting with this one */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009330 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9331 buf2,
9332 PyUnicode_KIND_SIZE(rkind, len2));
9333 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009334
9335 while ( --maxcount > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009336 i = anylib_find(rkind, sbuf+PyUnicode_KIND_SIZE(rkind, i),
9337 slen-i,
9338 buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009339 if (i == -1)
9340 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009341 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9342 buf2,
9343 PyUnicode_KIND_SIZE(rkind, len2));
9344 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009345 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009346
9347 u = PyUnicode_FromKindAndData(rkind, res, slen);
9348 PyMem_Free(res);
9349 if (!u) goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009350 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009351 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009352
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009353 Py_ssize_t n, i, j, ires;
9354 Py_ssize_t product, new_size;
9355 int rkind = skind;
9356 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009357
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009358 if (kind1 < rkind) {
9359 buf1 = _PyUnicode_AsKind(str1, rkind);
9360 if (!buf1) goto error;
9361 release1 = 1;
9362 }
9363 n = anylib_count(rkind, sbuf, slen, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009364 if (n == 0)
9365 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009366 if (kind2 < rkind) {
9367 buf2 = _PyUnicode_AsKind(str2, rkind);
9368 if (!buf2) goto error;
9369 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009370 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009371 else if (kind2 > rkind) {
9372 rkind = kind2;
9373 sbuf = _PyUnicode_AsKind(self, rkind);
9374 if (!sbuf) goto error;
9375 srelease = 1;
9376 if (release1) PyMem_Free(buf1);
9377 buf1 = _PyUnicode_AsKind(str1, rkind);
9378 if (!buf1) goto error;
9379 release1 = 1;
9380 }
9381 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
9382 PyUnicode_GET_LENGTH(str1))); */
9383 product = n * (len2-len1);
9384 if ((product / (len2-len1)) != n) {
9385 PyErr_SetString(PyExc_OverflowError,
9386 "replace string is too long");
9387 goto error;
9388 }
9389 new_size = slen + product;
9390 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
9391 PyErr_SetString(PyExc_OverflowError,
9392 "replace string is too long");
9393 goto error;
9394 }
9395 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size));
9396 if (!res)
9397 goto error;
9398 ires = i = 0;
9399 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009400 while (n-- > 0) {
9401 /* look for next match */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009402 j = anylib_find(rkind,
9403 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9404 slen-i, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009405 if (j == -1)
9406 break;
9407 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009408 /* copy unchanged part [i:j] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009409 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9410 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9411 PyUnicode_KIND_SIZE(rkind, j-i));
9412 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009413 }
9414 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009415 if (len2 > 0) {
9416 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9417 buf2,
9418 PyUnicode_KIND_SIZE(rkind, len2));
9419 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009420 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009421 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009422 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009423 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009424 /* copy tail [i:] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009425 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9426 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9427 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009428 } else {
9429 /* interleave */
9430 while (n > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009431 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9432 buf2,
9433 PyUnicode_KIND_SIZE(rkind, len2));
9434 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009435 if (--n <= 0)
9436 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009437 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9438 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9439 PyUnicode_KIND_SIZE(rkind, 1));
9440 ires++;
9441 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009442 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009443 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9444 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9445 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009446 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009447 u = PyUnicode_FromKindAndData(rkind, res, new_size);
Martin v. Löwis0b1d3482011-10-01 16:35:40 +02009448 PyMem_Free(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009449 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009450 if (srelease)
9451 PyMem_FREE(sbuf);
9452 if (release1)
9453 PyMem_FREE(buf1);
9454 if (release2)
9455 PyMem_FREE(buf2);
9456 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009457
Benjamin Peterson29060642009-01-31 22:14:21 +00009458 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00009459 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009460 if (srelease)
9461 PyMem_FREE(sbuf);
9462 if (release1)
9463 PyMem_FREE(buf1);
9464 if (release2)
9465 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009466 if (PyUnicode_CheckExact(self)) {
9467 Py_INCREF(self);
9468 return (PyObject *) self;
9469 }
Victor Stinner034f6cf2011-09-30 02:26:44 +02009470 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009471 error:
9472 if (srelease && sbuf)
9473 PyMem_FREE(sbuf);
9474 if (release1 && buf1)
9475 PyMem_FREE(buf1);
9476 if (release2 && buf2)
9477 PyMem_FREE(buf2);
9478 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009479}
9480
9481/* --- Unicode Object Methods --------------------------------------------- */
9482
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009483PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009484 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009485\n\
9486Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009487characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009488
9489static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009490unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009491{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009492 return fixup(self, fixtitle);
9493}
9494
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009495PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009496 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009497\n\
9498Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00009499have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009500
9501static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009502unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009503{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009504 return fixup(self, fixcapitalize);
9505}
9506
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): whitespace-split self,
   capitalize every word in place inside the list, then join the words
   with PyUnicode_Join(NULL, ...).  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping it into the list slot */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capitalized = fixup(
            (PyUnicodeObject *)PyList_GET_ITEM(words, idx),
            fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
9544
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009545/* Argument converter. Coerces to a single unicode character */
9546
9547static int
9548convert_uc(PyObject *obj, void *addr)
9549{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009550 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009551 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009552
Benjamin Peterson14339b62009-01-31 16:36:08 +00009553 uniobj = PyUnicode_FromObject(obj);
9554 if (uniobj == NULL) {
9555 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009556 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009557 return 0;
9558 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009559 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009560 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009561 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009562 Py_DECREF(uniobj);
9563 return 0;
9564 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009565 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009566 Py_DECREF(uniobj);
9567 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009568}
9569
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009570PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009571 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009572\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009573Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009574done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009575
9576static PyObject *
9577unicode_center(PyUnicodeObject *self, PyObject *args)
9578{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009579 Py_ssize_t marg, left;
9580 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009581 Py_UCS4 fillchar = ' ';
9582
Victor Stinnere9a29352011-10-01 02:14:59 +02009583 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009584 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009585
Victor Stinnere9a29352011-10-01 02:14:59 +02009586 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009587 return NULL;
9588
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009589 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009590 Py_INCREF(self);
9591 return (PyObject*) self;
9592 }
9593
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009594 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009595 left = marg / 2 + (marg & width & 1);
9596
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009597 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009598}
9599
Marc-André Lemburge5034372000-08-08 08:04:29 +00009600#if 0
9601
9602/* This code should go into some future Unicode collation support
9603 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00009604 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00009605
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009606/* speedy UTF-16 code point order comparison */
9607/* gleaned from: */
9608/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
9609
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009610static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009611{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009612 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00009613 0, 0, 0, 0, 0, 0, 0, 0,
9614 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009615 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009616};
9617
Guido van Rossumd57fd912000-03-10 22:53:23 +00009618static int
9619unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9620{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009621 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009622
Guido van Rossumd57fd912000-03-10 22:53:23 +00009623 Py_UNICODE *s1 = str1->str;
9624 Py_UNICODE *s2 = str2->str;
9625
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009626 len1 = str1->_base._base.length;
9627 len2 = str2->_base._base.length;
Tim Petersced69f82003-09-16 20:30:58 +00009628
Guido van Rossumd57fd912000-03-10 22:53:23 +00009629 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00009630 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009631
9632 c1 = *s1++;
9633 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00009634
Benjamin Peterson29060642009-01-31 22:14:21 +00009635 if (c1 > (1<<11) * 26)
9636 c1 += utf16Fixup[c1>>11];
9637 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009638 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009639 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00009640
9641 if (c1 != c2)
9642 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00009643
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009644 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009645 }
9646
9647 return (len1 < len2) ? -1 : (len1 != len2);
9648}
9649
Marc-André Lemburge5034372000-08-08 08:04:29 +00009650#else
9651
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009652/* This function assumes that str1 and str2 are readied by the caller. */
9653
Marc-André Lemburge5034372000-08-08 08:04:29 +00009654static int
9655unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9656{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009657 int kind1, kind2;
9658 void *data1, *data2;
9659 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009660
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009661 kind1 = PyUnicode_KIND(str1);
9662 kind2 = PyUnicode_KIND(str2);
9663 data1 = PyUnicode_DATA(str1);
9664 data2 = PyUnicode_DATA(str2);
9665 len1 = PyUnicode_GET_LENGTH(str1);
9666 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +00009667
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009668 for (i = 0; i < len1 && i < len2; ++i) {
9669 Py_UCS4 c1, c2;
9670 c1 = PyUnicode_READ(kind1, data1, i);
9671 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +00009672
9673 if (c1 != c2)
9674 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009675 }
9676
9677 return (len1 < len2) ? -1 : (len1 != len2);
9678}
9679
9680#endif
9681
Alexander Belopolsky40018472011-02-26 01:02:56 +00009682int
9683PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009684{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009685 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9686 if (PyUnicode_READY(left) == -1 ||
9687 PyUnicode_READY(right) == -1)
9688 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009689 return unicode_compare((PyUnicodeObject *)left,
9690 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009691 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009692 PyErr_Format(PyExc_TypeError,
9693 "Can't compare %.100s and %.100s",
9694 left->ob_type->tp_name,
9695 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009696 return -1;
9697}
9698
Martin v. Löwis5b222132007-06-10 09:51:05 +00009699int
9700PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
9701{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009702 Py_ssize_t i;
9703 int kind;
9704 void *data;
9705 Py_UCS4 chr;
9706
Victor Stinner910337b2011-10-03 03:20:16 +02009707 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009708 if (PyUnicode_READY(uni) == -1)
9709 return -1;
9710 kind = PyUnicode_KIND(uni);
9711 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009712 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009713 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
9714 if (chr != str[i])
9715 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00009716 /* This check keeps Python strings that end in '\0' from comparing equal
9717 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009718 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +00009719 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009720 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00009721 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009722 return 0;
9723}
9724
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009725
Benjamin Peterson29060642009-01-31 22:14:21 +00009726#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00009727 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009728
Alexander Belopolsky40018472011-02-26 01:02:56 +00009729PyObject *
9730PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009731{
9732 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009733
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009734 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9735 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009736 if (PyUnicode_READY(left) == -1 ||
9737 PyUnicode_READY(right) == -1)
9738 return NULL;
9739 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
9740 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009741 if (op == Py_EQ) {
9742 Py_INCREF(Py_False);
9743 return Py_False;
9744 }
9745 if (op == Py_NE) {
9746 Py_INCREF(Py_True);
9747 return Py_True;
9748 }
9749 }
9750 if (left == right)
9751 result = 0;
9752 else
9753 result = unicode_compare((PyUnicodeObject *)left,
9754 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009755
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009756 /* Convert the return value to a Boolean */
9757 switch (op) {
9758 case Py_EQ:
9759 v = TEST_COND(result == 0);
9760 break;
9761 case Py_NE:
9762 v = TEST_COND(result != 0);
9763 break;
9764 case Py_LE:
9765 v = TEST_COND(result <= 0);
9766 break;
9767 case Py_GE:
9768 v = TEST_COND(result >= 0);
9769 break;
9770 case Py_LT:
9771 v = TEST_COND(result == -1);
9772 break;
9773 case Py_GT:
9774 v = TEST_COND(result == 1);
9775 break;
9776 default:
9777 PyErr_BadArgument();
9778 return NULL;
9779 }
9780 Py_INCREF(v);
9781 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009782 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009783
Brian Curtindfc80e32011-08-10 20:28:54 -05009784 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009785}
9786
Alexander Belopolsky40018472011-02-26 01:02:56 +00009787int
9788PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00009789{
Thomas Wouters477c8d52006-05-27 19:21:47 +00009790 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009791 int kind1, kind2, kind;
9792 void *buf1, *buf2;
9793 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009794 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009795
9796 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00009797 sub = PyUnicode_FromObject(element);
9798 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009799 PyErr_Format(PyExc_TypeError,
9800 "'in <string>' requires string as left operand, not %s",
9801 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009802 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009803 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009804 if (PyUnicode_READY(sub) == -1)
9805 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009806
Thomas Wouters477c8d52006-05-27 19:21:47 +00009807 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +02009808 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009809 Py_DECREF(sub);
9810 return -1;
9811 }
9812
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009813 kind1 = PyUnicode_KIND(str);
9814 kind2 = PyUnicode_KIND(sub);
9815 kind = kind1 > kind2 ? kind1 : kind2;
9816 buf1 = PyUnicode_DATA(str);
9817 buf2 = PyUnicode_DATA(sub);
9818 if (kind1 != kind)
9819 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
9820 if (!buf1) {
9821 Py_DECREF(sub);
9822 return -1;
9823 }
9824 if (kind2 != kind)
9825 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
9826 if (!buf2) {
9827 Py_DECREF(sub);
9828 if (kind1 != kind) PyMem_Free(buf1);
9829 return -1;
9830 }
9831 len1 = PyUnicode_GET_LENGTH(str);
9832 len2 = PyUnicode_GET_LENGTH(sub);
9833
9834 switch(kind) {
9835 case PyUnicode_1BYTE_KIND:
9836 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
9837 break;
9838 case PyUnicode_2BYTE_KIND:
9839 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
9840 break;
9841 case PyUnicode_4BYTE_KIND:
9842 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
9843 break;
9844 default:
9845 result = -1;
9846 assert(0);
9847 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009848
9849 Py_DECREF(str);
9850 Py_DECREF(sub);
9851
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009852 if (kind1 != kind)
9853 PyMem_Free(buf1);
9854 if (kind2 != kind)
9855 PyMem_Free(buf2);
9856
Guido van Rossum403d68b2000-03-13 15:55:09 +00009857 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009858}
9859
Guido van Rossumd57fd912000-03-10 22:53:23 +00009860/* Concat to string or Unicode object giving a new Unicode object. */
9861
Alexander Belopolsky40018472011-02-26 01:02:56 +00009862PyObject *
9863PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009864{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009865 PyObject *u = NULL, *v = NULL, *w;
9866 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009867
9868 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009869 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009870 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009871 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009872 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009873 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009874 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009875
9876 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +02009877 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009878 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009879 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009880 }
Victor Stinnera464fc12011-10-02 20:39:30 +02009881 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009882 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009883 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009884 }
9885
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009886 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinnerff9e50f2011-09-28 22:17:19 +02009887 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009888
Guido van Rossumd57fd912000-03-10 22:53:23 +00009889 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009890 w = PyUnicode_New(
9891 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
9892 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009893 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009894 goto onError;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009895 if (PyUnicode_CopyCharacters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)) < 0)
9896 goto onError;
Victor Stinner157f83f2011-09-28 21:41:31 +02009897 if (PyUnicode_CopyCharacters(w, PyUnicode_GET_LENGTH(u),
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009898 v, 0,
9899 PyUnicode_GET_LENGTH(v)) < 0)
9900 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009901 Py_DECREF(u);
9902 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009903 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009904
Benjamin Peterson29060642009-01-31 22:14:21 +00009905 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009906 Py_XDECREF(u);
9907 Py_XDECREF(v);
9908 return NULL;
9909}
9910
Walter Dörwald1ab83302007-05-18 17:15:44 +00009911void
Victor Stinner23e56682011-10-03 03:54:37 +02009912PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +00009913{
Victor Stinner23e56682011-10-03 03:54:37 +02009914 PyObject *left, *res;
9915
9916 if (p_left == NULL) {
9917 if (!PyErr_Occurred())
9918 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +00009919 return;
9920 }
Victor Stinner23e56682011-10-03 03:54:37 +02009921 left = *p_left;
9922 if (right == NULL || !PyUnicode_Check(left)) {
9923 if (!PyErr_Occurred())
9924 PyErr_BadInternalCall();
9925 goto error;
9926 }
9927
9928 if (PyUnicode_CheckExact(left) && left != unicode_empty
9929 && PyUnicode_CheckExact(right) && right != unicode_empty
9930 && unicode_resizable(left)
9931 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
9932 || _PyUnicode_WSTR(left) != NULL))
9933 {
9934 Py_ssize_t u_len, v_len, new_len, copied;
9935
9936 /* FIXME: don't make wstr string ready */
9937 if (PyUnicode_READY(left))
9938 goto error;
9939 if (PyUnicode_READY(right))
9940 goto error;
9941
9942 /* FIXME: support ascii+latin1, PyASCIIObject => PyCompactUnicodeObject */
9943 if (PyUnicode_MAX_CHAR_VALUE(right) <= PyUnicode_MAX_CHAR_VALUE(left))
9944 {
9945 u_len = PyUnicode_GET_LENGTH(left);
9946 v_len = PyUnicode_GET_LENGTH(right);
9947 if (u_len > PY_SSIZE_T_MAX - v_len) {
9948 PyErr_SetString(PyExc_OverflowError,
9949 "strings are too large to concat");
9950 goto error;
9951 }
9952 new_len = u_len + v_len;
9953
9954 /* Now we own the last reference to 'left', so we can resize it
9955 * in-place.
9956 */
9957 if (unicode_resize(&left, new_len) != 0) {
9958 /* XXX if _PyUnicode_Resize() fails, 'left' has been
9959 * deallocated so it cannot be put back into
9960 * 'variable'. The MemoryError is raised when there
9961 * is no value in 'variable', which might (very
9962 * remotely) be a cause of incompatibilities.
9963 */
9964 goto error;
9965 }
9966 /* copy 'right' into the newly allocated area of 'left' */
9967 copied = PyUnicode_CopyCharacters(left, u_len,
9968 right, 0,
9969 v_len);
9970 assert(0 <= copied);
9971 *p_left = left;
9972 return;
9973 }
9974 }
9975
9976 res = PyUnicode_Concat(left, right);
9977 if (res == NULL)
9978 goto error;
9979 Py_DECREF(left);
9980 *p_left = res;
9981 return;
9982
9983error:
9984 Py_DECREF(*p_left);
9985 *p_left = NULL;
Walter Dörwald1ab83302007-05-18 17:15:44 +00009986}
9987
9988void
9989PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
9990{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009991 PyUnicode_Append(pleft, right);
9992 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00009993}
9994
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009995PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009996 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009997\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00009998Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009999string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010000interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010001
10002static PyObject *
10003unicode_count(PyUnicodeObject *self, PyObject *args)
10004{
10005 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010006 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010007 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010008 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010009 int kind1, kind2, kind;
10010 void *buf1, *buf2;
10011 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010012
Jesus Ceaac451502011-04-20 17:09:23 +020010013 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10014 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010015 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010016
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010017 kind1 = PyUnicode_KIND(self);
10018 kind2 = PyUnicode_KIND(substring);
10019 kind = kind1 > kind2 ? kind1 : kind2;
10020 buf1 = PyUnicode_DATA(self);
10021 buf2 = PyUnicode_DATA(substring);
10022 if (kind1 != kind)
10023 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
10024 if (!buf1) {
10025 Py_DECREF(substring);
10026 return NULL;
10027 }
10028 if (kind2 != kind)
10029 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
10030 if (!buf2) {
10031 Py_DECREF(substring);
10032 if (kind1 != kind) PyMem_Free(buf1);
10033 return NULL;
10034 }
10035 len1 = PyUnicode_GET_LENGTH(self);
10036 len2 = PyUnicode_GET_LENGTH(substring);
10037
10038 ADJUST_INDICES(start, end, len1);
10039 switch(kind) {
10040 case PyUnicode_1BYTE_KIND:
10041 iresult = ucs1lib_count(
10042 ((Py_UCS1*)buf1) + start, end - start,
10043 buf2, len2, PY_SSIZE_T_MAX
10044 );
10045 break;
10046 case PyUnicode_2BYTE_KIND:
10047 iresult = ucs2lib_count(
10048 ((Py_UCS2*)buf1) + start, end - start,
10049 buf2, len2, PY_SSIZE_T_MAX
10050 );
10051 break;
10052 case PyUnicode_4BYTE_KIND:
10053 iresult = ucs4lib_count(
10054 ((Py_UCS4*)buf1) + start, end - start,
10055 buf2, len2, PY_SSIZE_T_MAX
10056 );
10057 break;
10058 default:
10059 assert(0); iresult = 0;
10060 }
10061
10062 result = PyLong_FromSsize_t(iresult);
10063
10064 if (kind1 != kind)
10065 PyMem_Free(buf1);
10066 if (kind2 != kind)
10067 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010068
10069 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010070
Guido van Rossumd57fd912000-03-10 22:53:23 +000010071 return result;
10072}
10073
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010074PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010075 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010076\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010077Encode S using the codec registered for encoding. Default encoding\n\
10078is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010079handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010080a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10081'xmlcharrefreplace' as well as any other name registered with\n\
10082codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010083
10084static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +000010085unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010086{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010087 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010088 char *encoding = NULL;
10089 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010090
Benjamin Peterson308d6372009-09-18 21:42:35 +000010091 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10092 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010093 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +000010094 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010095}
10096
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010097PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010098 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010099\n\
10100Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010101If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010102
10103static PyObject*
10104unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
10105{
10106 Py_UNICODE *e;
10107 Py_UNICODE *p;
10108 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010109 Py_UNICODE *qe;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010110 Py_ssize_t i, j, incr, wstr_length;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010111 PyUnicodeObject *u;
10112 int tabsize = 8;
10113
10114 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010115 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010116
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010117 if (PyUnicode_AsUnicodeAndSize((PyObject *)self, &wstr_length) == NULL)
10118 return NULL;
10119
Thomas Wouters7e474022000-07-16 12:04:32 +000010120 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010121 i = 0; /* chars up to and including most recent \n or \r */
10122 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010123 e = _PyUnicode_WSTR(self) + wstr_length; /* end of input */
10124 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010125 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010126 if (tabsize > 0) {
10127 incr = tabsize - (j % tabsize); /* cannot overflow */
10128 if (j > PY_SSIZE_T_MAX - incr)
10129 goto overflow1;
10130 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010131 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010132 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010133 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010134 if (j > PY_SSIZE_T_MAX - 1)
10135 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010136 j++;
10137 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010138 if (i > PY_SSIZE_T_MAX - j)
10139 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010140 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010141 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010142 }
10143 }
10144
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010145 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +000010146 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010147
Guido van Rossumd57fd912000-03-10 22:53:23 +000010148 /* Second pass: create output string and fill it */
10149 u = _PyUnicode_New(i + j);
10150 if (!u)
10151 return NULL;
10152
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010153 j = 0; /* same as in first pass */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010154 q = _PyUnicode_WSTR(u); /* next output char */
10155 qe = _PyUnicode_WSTR(u) + PyUnicode_GET_SIZE(u); /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +000010156
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010157 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010158 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010159 if (tabsize > 0) {
10160 i = tabsize - (j % tabsize);
10161 j += i;
10162 while (i--) {
10163 if (q >= qe)
10164 goto overflow2;
10165 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010166 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010167 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010168 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010169 else {
10170 if (q >= qe)
10171 goto overflow2;
10172 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010173 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010174 if (*p == '\n' || *p == '\r')
10175 j = 0;
10176 }
10177
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020010178 if (_PyUnicode_READY_REPLACE(&u)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010179 Py_DECREF(u);
10180 return NULL;
10181 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010182 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010183
10184 overflow2:
10185 Py_DECREF(u);
10186 overflow1:
10187 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10188 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010189}
10190
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010191PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010192 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010193\n\
10194Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010195such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010196arguments start and end are interpreted as in slice notation.\n\
10197\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010198Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010199
10200static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010201unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010202{
Jesus Ceaac451502011-04-20 17:09:23 +020010203 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010204 Py_ssize_t start;
10205 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010206 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010207
Jesus Ceaac451502011-04-20 17:09:23 +020010208 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10209 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010210 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010211
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010212 if (PyUnicode_READY(self) == -1)
10213 return NULL;
10214 if (PyUnicode_READY(substring) == -1)
10215 return NULL;
10216
10217 result = any_find_slice(
10218 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
10219 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010220 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010221
10222 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010223
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010224 if (result == -2)
10225 return NULL;
10226
Christian Heimes217cfd12007-12-02 14:31:20 +000010227 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010228}
10229
10230static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010231unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010232{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010233 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
10234 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010235 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010236 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010237}
10238
Guido van Rossumc2504932007-09-18 19:42:40 +000010239/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010010240 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000010241static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +000010242unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010243{
Guido van Rossumc2504932007-09-18 19:42:40 +000010244 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010010245 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010246
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010247 if (_PyUnicode_HASH(self) != -1)
10248 return _PyUnicode_HASH(self);
10249 if (PyUnicode_READY(self) == -1)
10250 return -1;
10251 len = PyUnicode_GET_LENGTH(self);
10252
10253 /* The hash function as a macro, gets expanded three times below. */
10254#define HASH(P) \
10255 x = (Py_uhash_t)*P << 7; \
10256 while (--len >= 0) \
10257 x = (1000003*x) ^ (Py_uhash_t)*P++;
10258
10259 switch (PyUnicode_KIND(self)) {
10260 case PyUnicode_1BYTE_KIND: {
10261 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
10262 HASH(c);
10263 break;
10264 }
10265 case PyUnicode_2BYTE_KIND: {
10266 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
10267 HASH(s);
10268 break;
10269 }
10270 default: {
10271 Py_UCS4 *l;
10272 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
10273 "Impossible switch case in unicode_hash");
10274 l = PyUnicode_4BYTE_DATA(self);
10275 HASH(l);
10276 break;
10277 }
10278 }
10279 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
10280
Guido van Rossumc2504932007-09-18 19:42:40 +000010281 if (x == -1)
10282 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010283 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010284 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010285}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010286#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000010287
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010288PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010289 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010290\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010291Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010292
10293static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010294unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010295{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010296 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +020010297 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010298 Py_ssize_t start;
10299 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010300
Jesus Ceaac451502011-04-20 17:09:23 +020010301 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
10302 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010303 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010304
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010305 if (PyUnicode_READY(self) == -1)
10306 return NULL;
10307 if (PyUnicode_READY(substring) == -1)
10308 return NULL;
10309
10310 result = any_find_slice(
10311 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
10312 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010313 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010314
10315 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010316
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010317 if (result == -2)
10318 return NULL;
10319
Guido van Rossumd57fd912000-03-10 22:53:23 +000010320 if (result < 0) {
10321 PyErr_SetString(PyExc_ValueError, "substring not found");
10322 return NULL;
10323 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010324
Christian Heimes217cfd12007-12-02 14:31:20 +000010325 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010326}
10327
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010328PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010329 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010330\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010331Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010332at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010333
10334static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010335unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010336{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010337 Py_ssize_t i, length;
10338 int kind;
10339 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010340 int cased;
10341
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010342 if (PyUnicode_READY(self) == -1)
10343 return NULL;
10344 length = PyUnicode_GET_LENGTH(self);
10345 kind = PyUnicode_KIND(self);
10346 data = PyUnicode_DATA(self);
10347
Guido van Rossumd57fd912000-03-10 22:53:23 +000010348 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010349 if (length == 1)
10350 return PyBool_FromLong(
10351 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010352
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010353 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010354 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010355 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010356
Guido van Rossumd57fd912000-03-10 22:53:23 +000010357 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010358 for (i = 0; i < length; i++) {
10359 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010360
Benjamin Peterson29060642009-01-31 22:14:21 +000010361 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
10362 return PyBool_FromLong(0);
10363 else if (!cased && Py_UNICODE_ISLOWER(ch))
10364 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010365 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010366 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010367}
10368
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010369PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010370 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010371\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010372Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010373at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010374
10375static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010376unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010377{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010378 Py_ssize_t i, length;
10379 int kind;
10380 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010381 int cased;
10382
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010383 if (PyUnicode_READY(self) == -1)
10384 return NULL;
10385 length = PyUnicode_GET_LENGTH(self);
10386 kind = PyUnicode_KIND(self);
10387 data = PyUnicode_DATA(self);
10388
Guido van Rossumd57fd912000-03-10 22:53:23 +000010389 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010390 if (length == 1)
10391 return PyBool_FromLong(
10392 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010393
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010394 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010395 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010396 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010397
Guido van Rossumd57fd912000-03-10 22:53:23 +000010398 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010399 for (i = 0; i < length; i++) {
10400 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010401
Benjamin Peterson29060642009-01-31 22:14:21 +000010402 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
10403 return PyBool_FromLong(0);
10404 else if (!cased && Py_UNICODE_ISUPPER(ch))
10405 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010406 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010407 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010408}
10409
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010410PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010411 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010412\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010413Return True if S is a titlecased string and there is at least one\n\
10414character in S, i.e. upper- and titlecase characters may only\n\
10415follow uncased characters and lowercase characters only cased ones.\n\
10416Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010417
10418static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010419unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010420{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010421 Py_ssize_t i, length;
10422 int kind;
10423 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010424 int cased, previous_is_cased;
10425
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010426 if (PyUnicode_READY(self) == -1)
10427 return NULL;
10428 length = PyUnicode_GET_LENGTH(self);
10429 kind = PyUnicode_KIND(self);
10430 data = PyUnicode_DATA(self);
10431
Guido van Rossumd57fd912000-03-10 22:53:23 +000010432 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010433 if (length == 1) {
10434 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10435 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
10436 (Py_UNICODE_ISUPPER(ch) != 0));
10437 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010438
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010439 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010440 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010441 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010442
Guido van Rossumd57fd912000-03-10 22:53:23 +000010443 cased = 0;
10444 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010445 for (i = 0; i < length; i++) {
10446 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010447
Benjamin Peterson29060642009-01-31 22:14:21 +000010448 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
10449 if (previous_is_cased)
10450 return PyBool_FromLong(0);
10451 previous_is_cased = 1;
10452 cased = 1;
10453 }
10454 else if (Py_UNICODE_ISLOWER(ch)) {
10455 if (!previous_is_cased)
10456 return PyBool_FromLong(0);
10457 previous_is_cased = 1;
10458 cased = 1;
10459 }
10460 else
10461 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010462 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010463 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010464}
10465
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010466PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010467 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010468\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010469Return True if all characters in S are whitespace\n\
10470and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010471
10472static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010473unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010474{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010475 Py_ssize_t i, length;
10476 int kind;
10477 void *data;
10478
10479 if (PyUnicode_READY(self) == -1)
10480 return NULL;
10481 length = PyUnicode_GET_LENGTH(self);
10482 kind = PyUnicode_KIND(self);
10483 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010484
Guido van Rossumd57fd912000-03-10 22:53:23 +000010485 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010486 if (length == 1)
10487 return PyBool_FromLong(
10488 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010489
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010490 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010491 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010492 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010493
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010494 for (i = 0; i < length; i++) {
10495 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010496 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010497 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010498 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010499 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010500}
10501
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010502PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010503 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010504\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010505Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010506and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010507
10508static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010509unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010510{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010511 Py_ssize_t i, length;
10512 int kind;
10513 void *data;
10514
10515 if (PyUnicode_READY(self) == -1)
10516 return NULL;
10517 length = PyUnicode_GET_LENGTH(self);
10518 kind = PyUnicode_KIND(self);
10519 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010520
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010521 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010522 if (length == 1)
10523 return PyBool_FromLong(
10524 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010525
10526 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010527 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010528 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010529
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010530 for (i = 0; i < length; i++) {
10531 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010532 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010533 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010534 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010535}
10536
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010537PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010538 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010539\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010540Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010541and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010542
10543static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010544unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010545{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010546 int kind;
10547 void *data;
10548 Py_ssize_t len, i;
10549
10550 if (PyUnicode_READY(self) == -1)
10551 return NULL;
10552
10553 kind = PyUnicode_KIND(self);
10554 data = PyUnicode_DATA(self);
10555 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010556
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010557 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010558 if (len == 1) {
10559 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10560 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10561 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010562
10563 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010564 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010565 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010566
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010567 for (i = 0; i < len; i++) {
10568 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010569 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010570 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010571 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010572 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010573}
10574
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010575PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010576 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010577\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010578Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010579False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010580
10581static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010582unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010583{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010584 Py_ssize_t i, length;
10585 int kind;
10586 void *data;
10587
10588 if (PyUnicode_READY(self) == -1)
10589 return NULL;
10590 length = PyUnicode_GET_LENGTH(self);
10591 kind = PyUnicode_KIND(self);
10592 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010593
Guido van Rossumd57fd912000-03-10 22:53:23 +000010594 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010595 if (length == 1)
10596 return PyBool_FromLong(
10597 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010598
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010599 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010600 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010601 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010602
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010603 for (i = 0; i < length; i++) {
10604 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010605 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010606 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010607 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010608}
10609
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010610PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010611 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010612\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010613Return True if all characters in S are digits\n\
10614and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010615
10616static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010617unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010618{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010619 Py_ssize_t i, length;
10620 int kind;
10621 void *data;
10622
10623 if (PyUnicode_READY(self) == -1)
10624 return NULL;
10625 length = PyUnicode_GET_LENGTH(self);
10626 kind = PyUnicode_KIND(self);
10627 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010628
Guido van Rossumd57fd912000-03-10 22:53:23 +000010629 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010630 if (length == 1) {
10631 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10632 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
10633 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010634
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010635 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010636 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010637 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010638
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010639 for (i = 0; i < length; i++) {
10640 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010641 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010642 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010643 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010644}
10645
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010646PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010647 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010648\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010649Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010650False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010651
10652static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010653unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010654{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010655 Py_ssize_t i, length;
10656 int kind;
10657 void *data;
10658
10659 if (PyUnicode_READY(self) == -1)
10660 return NULL;
10661 length = PyUnicode_GET_LENGTH(self);
10662 kind = PyUnicode_KIND(self);
10663 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010664
Guido van Rossumd57fd912000-03-10 22:53:23 +000010665 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010666 if (length == 1)
10667 return PyBool_FromLong(
10668 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010669
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010670 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010671 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010672 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010673
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010674 for (i = 0; i < length; i++) {
10675 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010676 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010677 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010678 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010679}
10680
Martin v. Löwis47383402007-08-15 07:32:56 +000010681int
10682PyUnicode_IsIdentifier(PyObject *self)
10683{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010684 int kind;
10685 void *data;
10686 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010687 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000010688
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010689 if (PyUnicode_READY(self) == -1) {
10690 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000010691 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010692 }
10693
10694 /* Special case for empty strings */
10695 if (PyUnicode_GET_LENGTH(self) == 0)
10696 return 0;
10697 kind = PyUnicode_KIND(self);
10698 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000010699
10700 /* PEP 3131 says that the first character must be in
10701 XID_Start and subsequent characters in XID_Continue,
10702 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000010703 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000010704 letters, digits, underscore). However, given the current
10705 definition of XID_Start and XID_Continue, it is sufficient
10706 to check just for these, except that _ must be allowed
10707 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010708 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050010709 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000010710 return 0;
10711
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040010712 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010713 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010714 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000010715 return 1;
10716}
10717
10718PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010719 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000010720\n\
10721Return True if S is a valid identifier according\n\
10722to the language definition.");
10723
10724static PyObject*
10725unicode_isidentifier(PyObject *self)
10726{
10727 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
10728}
10729
Georg Brandl559e5d72008-06-11 18:37:52 +000010730PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010731 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000010732\n\
10733Return True if all characters in S are considered\n\
10734printable in repr() or S is empty, False otherwise.");
10735
10736static PyObject*
10737unicode_isprintable(PyObject *self)
10738{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010739 Py_ssize_t i, length;
10740 int kind;
10741 void *data;
10742
10743 if (PyUnicode_READY(self) == -1)
10744 return NULL;
10745 length = PyUnicode_GET_LENGTH(self);
10746 kind = PyUnicode_KIND(self);
10747 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000010748
10749 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010750 if (length == 1)
10751 return PyBool_FromLong(
10752 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000010753
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010754 for (i = 0; i < length; i++) {
10755 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010756 Py_RETURN_FALSE;
10757 }
10758 }
10759 Py_RETURN_TRUE;
10760}
10761
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010762PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000010763 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010764\n\
10765Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000010766iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010767
10768static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010769unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010770{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010771 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010772}
10773
Martin v. Löwis18e16552006-02-15 17:27:45 +000010774static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000010775unicode_length(PyUnicodeObject *self)
10776{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010777 if (PyUnicode_READY(self) == -1)
10778 return -1;
10779 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010780}
10781
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010782PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010783 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010784\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010785Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010786done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010787
10788static PyObject *
10789unicode_ljust(PyUnicodeObject *self, PyObject *args)
10790{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010791 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010792 Py_UCS4 fillchar = ' ';
10793
10794 if (PyUnicode_READY(self) == -1)
10795 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010796
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010797 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010798 return NULL;
10799
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010800 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010801 Py_INCREF(self);
10802 return (PyObject*) self;
10803 }
10804
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010805 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010806}
10807
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010808PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010809 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010810\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010811Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010812
10813static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010814unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010815{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010816 return fixup(self, fixlower);
10817}
10818
/* Strip modes.  The values double as indices into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings, indexed by the strip modes above */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip mode i: the format string without its leading
   three-character "|O:" part. */
#define STRIPNAME(i) (stripformat[i] + 3)
10827
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010828/* externally visible for str.strip(unicode) */
10829PyObject *
10830_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
10831{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010832 void *data;
10833 int kind;
10834 Py_ssize_t i, j, len;
10835 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010836
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010837 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
10838 return NULL;
10839
10840 kind = PyUnicode_KIND(self);
10841 data = PyUnicode_DATA(self);
10842 len = PyUnicode_GET_LENGTH(self);
10843 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
10844 PyUnicode_DATA(sepobj),
10845 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010846
Benjamin Peterson14339b62009-01-31 16:36:08 +000010847 i = 0;
10848 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010849 while (i < len &&
10850 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010851 i++;
10852 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010853 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010854
Benjamin Peterson14339b62009-01-31 16:36:08 +000010855 j = len;
10856 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010857 do {
10858 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010859 } while (j >= i &&
10860 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000010861 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010862 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010863
Victor Stinner12bab6d2011-10-01 01:53:49 +020010864 return PyUnicode_Substring((PyObject*)self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010865}
10866
10867PyObject*
10868PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
10869{
10870 unsigned char *data;
10871 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020010872 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010873
Victor Stinnerde636f32011-10-01 03:55:54 +020010874 if (PyUnicode_READY(self) == -1)
10875 return NULL;
10876
10877 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
10878
Victor Stinner12bab6d2011-10-01 01:53:49 +020010879 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010880 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020010881 if (PyUnicode_CheckExact(self)) {
10882 Py_INCREF(self);
10883 return self;
10884 }
10885 else
10886 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010887 }
10888
Victor Stinner12bab6d2011-10-01 01:53:49 +020010889 length = end - start;
10890 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010891 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010892
Victor Stinnerde636f32011-10-01 03:55:54 +020010893 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020010894 PyErr_SetString(PyExc_IndexError, "string index out of range");
10895 return NULL;
10896 }
10897
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010898 kind = PyUnicode_KIND(self);
10899 data = PyUnicode_1BYTE_DATA(self);
Victor Stinner034f6cf2011-09-30 02:26:44 +020010900 return PyUnicode_FromKindAndData(kind,
10901 data + PyUnicode_KIND_SIZE(kind, start),
Victor Stinner12bab6d2011-10-01 01:53:49 +020010902 length);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010903}
Guido van Rossumd57fd912000-03-10 22:53:23 +000010904
10905static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010906do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010907{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010908 int kind;
10909 void *data;
10910 Py_ssize_t len, i, j;
10911
10912 if (PyUnicode_READY(self) == -1)
10913 return NULL;
10914
10915 kind = PyUnicode_KIND(self);
10916 data = PyUnicode_DATA(self);
10917 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010918
Benjamin Peterson14339b62009-01-31 16:36:08 +000010919 i = 0;
10920 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010921 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010922 i++;
10923 }
10924 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010925
Benjamin Peterson14339b62009-01-31 16:36:08 +000010926 j = len;
10927 if (striptype != LEFTSTRIP) {
10928 do {
10929 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010930 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010931 j++;
10932 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010933
Victor Stinner12bab6d2011-10-01 01:53:49 +020010934 return PyUnicode_Substring((PyObject*)self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010935}
10936
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010937
10938static PyObject *
10939do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
10940{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010941 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010942
Benjamin Peterson14339b62009-01-31 16:36:08 +000010943 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
10944 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010945
Benjamin Peterson14339b62009-01-31 16:36:08 +000010946 if (sep != NULL && sep != Py_None) {
10947 if (PyUnicode_Check(sep))
10948 return _PyUnicode_XStrip(self, striptype, sep);
10949 else {
10950 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010951 "%s arg must be None or str",
10952 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010953 return NULL;
10954 }
10955 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010956
Benjamin Peterson14339b62009-01-31 16:36:08 +000010957 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010958}
10959
10960
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010961PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010962 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010963\n\
10964Return a copy of the string S with leading and trailing\n\
10965whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010966If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010967
10968static PyObject *
10969unicode_strip(PyUnicodeObject *self, PyObject *args)
10970{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010971 if (PyTuple_GET_SIZE(args) == 0)
10972 return do_strip(self, BOTHSTRIP); /* Common case */
10973 else
10974 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010975}
10976
10977
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010978PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010979 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010980\n\
10981Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010982If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010983
10984static PyObject *
10985unicode_lstrip(PyUnicodeObject *self, PyObject *args)
10986{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010987 if (PyTuple_GET_SIZE(args) == 0)
10988 return do_strip(self, LEFTSTRIP); /* Common case */
10989 else
10990 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010991}
10992
10993
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010994PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010995 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010996\n\
10997Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010998If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010999
11000static PyObject *
11001unicode_rstrip(PyUnicodeObject *self, PyObject *args)
11002{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011003 if (PyTuple_GET_SIZE(args) == 0)
11004 return do_strip(self, RIGHTSTRIP); /* Common case */
11005 else
11006 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011007}
11008
11009
Guido van Rossumd57fd912000-03-10 22:53:23 +000011010static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000011011unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011012{
11013 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011014 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011015
Georg Brandl222de0f2009-04-12 12:01:50 +000011016 if (len < 1) {
11017 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011018 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011019 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011020
Tim Peters7a29bd52001-09-12 03:03:31 +000011021 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011022 /* no repeat, return original string */
11023 Py_INCREF(str);
11024 return (PyObject*) str;
11025 }
Tim Peters8f422462000-09-09 06:13:41 +000011026
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011027 if (PyUnicode_READY(str) == -1)
11028 return NULL;
11029
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011030 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011031 PyErr_SetString(PyExc_OverflowError,
11032 "repeated string is too long");
11033 return NULL;
11034 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011035 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011036
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011037 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011038 if (!u)
11039 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011040 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011041
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011042 if (PyUnicode_GET_LENGTH(str) == 1) {
11043 const int kind = PyUnicode_KIND(str);
11044 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11045 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011046 if (kind == PyUnicode_1BYTE_KIND)
11047 memset(to, (unsigned char)fill_char, len);
11048 else {
11049 for (n = 0; n < len; ++n)
11050 PyUnicode_WRITE(kind, to, n, fill_char);
11051 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011052 }
11053 else {
11054 /* number of characters copied this far */
11055 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
11056 const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str);
11057 char *to = (char *) PyUnicode_DATA(u);
11058 Py_MEMCPY(to, PyUnicode_DATA(str),
11059 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011060 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011061 n = (done <= nchars-done) ? done : nchars-done;
11062 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011063 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011064 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011065 }
11066
11067 return (PyObject*) u;
11068}
11069
Alexander Belopolsky40018472011-02-26 01:02:56 +000011070PyObject *
11071PyUnicode_Replace(PyObject *obj,
11072 PyObject *subobj,
11073 PyObject *replobj,
11074 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011075{
11076 PyObject *self;
11077 PyObject *str1;
11078 PyObject *str2;
11079 PyObject *result;
11080
11081 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011082 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011083 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011084 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011085 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011086 Py_DECREF(self);
11087 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011088 }
11089 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011090 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011091 Py_DECREF(self);
11092 Py_DECREF(str1);
11093 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011094 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011095 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011096 Py_DECREF(self);
11097 Py_DECREF(str1);
11098 Py_DECREF(str2);
11099 return result;
11100}
11101
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011102PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011103 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011104\n\
11105Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011106old replaced by new. If the optional argument count is\n\
11107given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011108
11109static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011110unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011111{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011112 PyObject *str1;
11113 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011114 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011115 PyObject *result;
11116
Martin v. Löwis18e16552006-02-15 17:27:45 +000011117 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011118 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011119 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011120 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011121 str1 = PyUnicode_FromObject(str1);
11122 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11123 return NULL;
11124 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011125 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011126 Py_DECREF(str1);
11127 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011128 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011129
11130 result = replace(self, str1, str2, maxcount);
11131
11132 Py_DECREF(str1);
11133 Py_DECREF(str2);
11134 return result;
11135}
11136
Alexander Belopolsky40018472011-02-26 01:02:56 +000011137static PyObject *
11138unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011139{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011140 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011141 Py_ssize_t isize;
11142 Py_ssize_t osize, squote, dquote, i, o;
11143 Py_UCS4 max, quote;
11144 int ikind, okind;
11145 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011146
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011147 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011148 return NULL;
11149
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011150 isize = PyUnicode_GET_LENGTH(unicode);
11151 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011152
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011153 /* Compute length of output, quote characters, and
11154 maximum character */
11155 osize = 2; /* quotes */
11156 max = 127;
11157 squote = dquote = 0;
11158 ikind = PyUnicode_KIND(unicode);
11159 for (i = 0; i < isize; i++) {
11160 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11161 switch (ch) {
11162 case '\'': squote++; osize++; break;
11163 case '"': dquote++; osize++; break;
11164 case '\\': case '\t': case '\r': case '\n':
11165 osize += 2; break;
11166 default:
11167 /* Fast-path ASCII */
11168 if (ch < ' ' || ch == 0x7f)
11169 osize += 4; /* \xHH */
11170 else if (ch < 0x7f)
11171 osize++;
11172 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11173 osize++;
11174 max = ch > max ? ch : max;
11175 }
11176 else if (ch < 0x100)
11177 osize += 4; /* \xHH */
11178 else if (ch < 0x10000)
11179 osize += 6; /* \uHHHH */
11180 else
11181 osize += 10; /* \uHHHHHHHH */
11182 }
11183 }
11184
11185 quote = '\'';
11186 if (squote) {
11187 if (dquote)
11188 /* Both squote and dquote present. Use squote,
11189 and escape them */
11190 osize += squote;
11191 else
11192 quote = '"';
11193 }
11194
11195 repr = PyUnicode_New(osize, max);
11196 if (repr == NULL)
11197 return NULL;
11198 okind = PyUnicode_KIND(repr);
11199 odata = PyUnicode_DATA(repr);
11200
11201 PyUnicode_WRITE(okind, odata, 0, quote);
11202 PyUnicode_WRITE(okind, odata, osize-1, quote);
11203
11204 for (i = 0, o = 1; i < isize; i++) {
11205 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011206
11207 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011208 if ((ch == quote) || (ch == '\\')) {
11209 PyUnicode_WRITE(okind, odata, o++, '\\');
11210 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011211 continue;
11212 }
11213
Benjamin Peterson29060642009-01-31 22:14:21 +000011214 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011215 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011216 PyUnicode_WRITE(okind, odata, o++, '\\');
11217 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011218 }
11219 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011220 PyUnicode_WRITE(okind, odata, o++, '\\');
11221 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011222 }
11223 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011224 PyUnicode_WRITE(okind, odata, o++, '\\');
11225 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011226 }
11227
11228 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011229 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011230 PyUnicode_WRITE(okind, odata, o++, '\\');
11231 PyUnicode_WRITE(okind, odata, o++, 'x');
11232 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11233 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011234 }
11235
Georg Brandl559e5d72008-06-11 18:37:52 +000011236 /* Copy ASCII characters as-is */
11237 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011238 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011239 }
11240
Benjamin Peterson29060642009-01-31 22:14:21 +000011241 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000011242 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011243 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000011244 (categories Z* and C* except ASCII space)
11245 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011246 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011247 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011248 if (ch <= 0xff) {
11249 PyUnicode_WRITE(okind, odata, o++, '\\');
11250 PyUnicode_WRITE(okind, odata, o++, 'x');
11251 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11252 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011253 }
11254 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011255 else if (ch >= 0x10000) {
11256 PyUnicode_WRITE(okind, odata, o++, '\\');
11257 PyUnicode_WRITE(okind, odata, o++, 'U');
11258 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
11259 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
11260 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
11261 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
11262 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11263 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11264 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11265 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011266 }
11267 /* Map 16-bit characters to '\uxxxx' */
11268 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011269 PyUnicode_WRITE(okind, odata, o++, '\\');
11270 PyUnicode_WRITE(okind, odata, o++, 'u');
11271 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11272 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11273 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11274 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011275 }
11276 }
11277 /* Copy characters as-is */
11278 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011279 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011280 }
11281 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000011282 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011283 /* Closing quote already added at the beginning */
Walter Dörwald79e913e2007-05-12 11:08:06 +000011284 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011285}
11286
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011287PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011288 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011289\n\
11290Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011291such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011292arguments start and end are interpreted as in slice notation.\n\
11293\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011294Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011295
11296static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011297unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011298{
Jesus Ceaac451502011-04-20 17:09:23 +020011299 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011300 Py_ssize_t start;
11301 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011302 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011303
Jesus Ceaac451502011-04-20 17:09:23 +020011304 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
11305 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011306 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011307
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011308 if (PyUnicode_READY(self) == -1)
11309 return NULL;
11310 if (PyUnicode_READY(substring) == -1)
11311 return NULL;
11312
11313 result = any_find_slice(
11314 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11315 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011316 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011317
11318 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011319
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011320 if (result == -2)
11321 return NULL;
11322
Christian Heimes217cfd12007-12-02 14:31:20 +000011323 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011324}
11325
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011326PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011327 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011328\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011329Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011330
11331static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011332unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011333{
Jesus Ceaac451502011-04-20 17:09:23 +020011334 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011335 Py_ssize_t start;
11336 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011337 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011338
Jesus Ceaac451502011-04-20 17:09:23 +020011339 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
11340 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011341 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011342
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011343 if (PyUnicode_READY(self) == -1)
11344 return NULL;
11345 if (PyUnicode_READY(substring) == -1)
11346 return NULL;
11347
11348 result = any_find_slice(
11349 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11350 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011351 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011352
11353 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011354
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011355 if (result == -2)
11356 return NULL;
11357
Guido van Rossumd57fd912000-03-10 22:53:23 +000011358 if (result < 0) {
11359 PyErr_SetString(PyExc_ValueError, "substring not found");
11360 return NULL;
11361 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011362
Christian Heimes217cfd12007-12-02 14:31:20 +000011363 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011364}
11365
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011366PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011367 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011368\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011369Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011370done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011371
11372static PyObject *
11373unicode_rjust(PyUnicodeObject *self, PyObject *args)
11374{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011375 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011376 Py_UCS4 fillchar = ' ';
11377
Victor Stinnere9a29352011-10-01 02:14:59 +020011378 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011379 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011380
Victor Stinnere9a29352011-10-01 02:14:59 +020011381 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011382 return NULL;
11383
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011384 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011385 Py_INCREF(self);
11386 return (PyObject*) self;
11387 }
11388
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011389 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011390}
11391
Alexander Belopolsky40018472011-02-26 01:02:56 +000011392PyObject *
11393PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011394{
11395 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000011396
Guido van Rossumd57fd912000-03-10 22:53:23 +000011397 s = PyUnicode_FromObject(s);
11398 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011399 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011400 if (sep != NULL) {
11401 sep = PyUnicode_FromObject(sep);
11402 if (sep == NULL) {
11403 Py_DECREF(s);
11404 return NULL;
11405 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011406 }
11407
11408 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11409
11410 Py_DECREF(s);
11411 Py_XDECREF(sep);
11412 return result;
11413}
11414
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011415PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011416 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011417\n\
11418Return a list of the words in S, using sep as the\n\
11419delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000011420splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000011421whitespace string is a separator and empty strings are\n\
11422removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011423
11424static PyObject*
11425unicode_split(PyUnicodeObject *self, PyObject *args)
11426{
11427 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011428 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011429
Martin v. Löwis18e16552006-02-15 17:27:45 +000011430 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011431 return NULL;
11432
11433 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011434 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011435 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011436 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011437 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011438 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011439}
11440
Thomas Wouters477c8d52006-05-27 19:21:47 +000011441PyObject *
11442PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11443{
11444 PyObject* str_obj;
11445 PyObject* sep_obj;
11446 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011447 int kind1, kind2, kind;
11448 void *buf1 = NULL, *buf2 = NULL;
11449 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011450
11451 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020011452 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011453 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011454 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011455 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011456 Py_DECREF(str_obj);
11457 return NULL;
11458 }
11459
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011460 kind1 = PyUnicode_KIND(str_in);
11461 kind2 = PyUnicode_KIND(sep_obj);
11462 kind = kind1 > kind2 ? kind1 : kind2;
11463 buf1 = PyUnicode_DATA(str_in);
11464 if (kind1 != kind)
11465 buf1 = _PyUnicode_AsKind(str_in, kind);
11466 if (!buf1)
11467 goto onError;
11468 buf2 = PyUnicode_DATA(sep_obj);
11469 if (kind2 != kind)
11470 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11471 if (!buf2)
11472 goto onError;
11473 len1 = PyUnicode_GET_LENGTH(str_obj);
11474 len2 = PyUnicode_GET_LENGTH(sep_obj);
11475
11476 switch(PyUnicode_KIND(str_in)) {
11477 case PyUnicode_1BYTE_KIND:
11478 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11479 break;
11480 case PyUnicode_2BYTE_KIND:
11481 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11482 break;
11483 case PyUnicode_4BYTE_KIND:
11484 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11485 break;
11486 default:
11487 assert(0);
11488 out = 0;
11489 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011490
11491 Py_DECREF(sep_obj);
11492 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011493 if (kind1 != kind)
11494 PyMem_Free(buf1);
11495 if (kind2 != kind)
11496 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011497
11498 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011499 onError:
11500 Py_DECREF(sep_obj);
11501 Py_DECREF(str_obj);
11502 if (kind1 != kind && buf1)
11503 PyMem_Free(buf1);
11504 if (kind2 != kind && buf2)
11505 PyMem_Free(buf2);
11506 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011507}
11508
11509
11510PyObject *
11511PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11512{
11513 PyObject* str_obj;
11514 PyObject* sep_obj;
11515 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011516 int kind1, kind2, kind;
11517 void *buf1 = NULL, *buf2 = NULL;
11518 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011519
11520 str_obj = PyUnicode_FromObject(str_in);
11521 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000011522 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011523 sep_obj = PyUnicode_FromObject(sep_in);
11524 if (!sep_obj) {
11525 Py_DECREF(str_obj);
11526 return NULL;
11527 }
11528
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011529 kind1 = PyUnicode_KIND(str_in);
11530 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020011531 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011532 buf1 = PyUnicode_DATA(str_in);
11533 if (kind1 != kind)
11534 buf1 = _PyUnicode_AsKind(str_in, kind);
11535 if (!buf1)
11536 goto onError;
11537 buf2 = PyUnicode_DATA(sep_obj);
11538 if (kind2 != kind)
11539 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11540 if (!buf2)
11541 goto onError;
11542 len1 = PyUnicode_GET_LENGTH(str_obj);
11543 len2 = PyUnicode_GET_LENGTH(sep_obj);
11544
11545 switch(PyUnicode_KIND(str_in)) {
11546 case PyUnicode_1BYTE_KIND:
11547 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11548 break;
11549 case PyUnicode_2BYTE_KIND:
11550 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11551 break;
11552 case PyUnicode_4BYTE_KIND:
11553 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11554 break;
11555 default:
11556 assert(0);
11557 out = 0;
11558 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011559
11560 Py_DECREF(sep_obj);
11561 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011562 if (kind1 != kind)
11563 PyMem_Free(buf1);
11564 if (kind2 != kind)
11565 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011566
11567 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011568 onError:
11569 Py_DECREF(sep_obj);
11570 Py_DECREF(str_obj);
11571 if (kind1 != kind && buf1)
11572 PyMem_Free(buf1);
11573 if (kind2 != kind && buf2)
11574 PyMem_Free(buf2);
11575 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011576}
11577
11578PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011579 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011580\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011581Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011582the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011583found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011584
11585static PyObject*
11586unicode_partition(PyUnicodeObject *self, PyObject *separator)
11587{
11588 return PyUnicode_Partition((PyObject *)self, separator);
11589}
11590
11591PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000011592 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011593\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011594Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011595the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011596separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011597
11598static PyObject*
11599unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
11600{
11601 return PyUnicode_RPartition((PyObject *)self, separator);
11602}
11603
Alexander Belopolsky40018472011-02-26 01:02:56 +000011604PyObject *
11605PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011606{
11607 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011608
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011609 s = PyUnicode_FromObject(s);
11610 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011611 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011612 if (sep != NULL) {
11613 sep = PyUnicode_FromObject(sep);
11614 if (sep == NULL) {
11615 Py_DECREF(s);
11616 return NULL;
11617 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011618 }
11619
11620 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11621
11622 Py_DECREF(s);
11623 Py_XDECREF(sep);
11624 return result;
11625}
11626
11627PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011628 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011629\n\
11630Return a list of the words in S, using sep as the\n\
11631delimiter string, starting at the end of the string and\n\
11632working to the front. If maxsplit is given, at most maxsplit\n\
11633splits are done. If sep is not specified, any whitespace string\n\
11634is a separator.");
11635
11636static PyObject*
11637unicode_rsplit(PyUnicodeObject *self, PyObject *args)
11638{
11639 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011640 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011641
Martin v. Löwis18e16552006-02-15 17:27:45 +000011642 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011643 return NULL;
11644
11645 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011646 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011647 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011648 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011649 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011650 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011651}
11652
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011653PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011654 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011655\n\
11656Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000011657Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011658is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011659
11660static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011661unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011662{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011663 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000011664 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011665
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011666 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
11667 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011668 return NULL;
11669
Guido van Rossum86662912000-04-11 15:38:46 +000011670 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011671}
11672
11673static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000011674PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011675{
Walter Dörwald346737f2007-05-31 10:44:43 +000011676 if (PyUnicode_CheckExact(self)) {
11677 Py_INCREF(self);
11678 return self;
11679 } else
11680 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020011681 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011682}
11683
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011684PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011685 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011686\n\
11687Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011688and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011689
11690static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011691unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011692{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011693 return fixup(self, fixswapcase);
11694}
11695
Georg Brandlceee0772007-11-27 23:48:05 +000011696PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011697 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011698\n\
11699Return a translation table usable for str.translate().\n\
11700If there is only one argument, it must be a dictionary mapping Unicode\n\
11701ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011702Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011703If there are two arguments, they must be strings of equal length, and\n\
11704in the resulting dictionary, each character in x will be mapped to the\n\
11705character at the same position in y. If there is a third argument, it\n\
11706must be a string, whose characters will be mapped to None in the result.");
11707
11708static PyObject*
11709unicode_maketrans(PyUnicodeObject *null, PyObject *args)
11710{
11711 PyObject *x, *y = NULL, *z = NULL;
11712 PyObject *new = NULL, *key, *value;
11713 Py_ssize_t i = 0;
11714 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011715
Georg Brandlceee0772007-11-27 23:48:05 +000011716 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
11717 return NULL;
11718 new = PyDict_New();
11719 if (!new)
11720 return NULL;
11721 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011722 int x_kind, y_kind, z_kind;
11723 void *x_data, *y_data, *z_data;
11724
Georg Brandlceee0772007-11-27 23:48:05 +000011725 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000011726 if (!PyUnicode_Check(x)) {
11727 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
11728 "be a string if there is a second argument");
11729 goto err;
11730 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011731 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011732 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
11733 "arguments must have equal length");
11734 goto err;
11735 }
11736 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011737 x_kind = PyUnicode_KIND(x);
11738 y_kind = PyUnicode_KIND(y);
11739 x_data = PyUnicode_DATA(x);
11740 y_data = PyUnicode_DATA(y);
11741 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
11742 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
11743 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011744 if (!key || !value)
11745 goto err;
11746 res = PyDict_SetItem(new, key, value);
11747 Py_DECREF(key);
11748 Py_DECREF(value);
11749 if (res < 0)
11750 goto err;
11751 }
11752 /* create entries for deleting chars in z */
11753 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011754 z_kind = PyUnicode_KIND(z);
11755 z_data = PyUnicode_DATA(z);
Georg Brandlceee0772007-11-27 23:48:05 +000011756 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011757 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011758 if (!key)
11759 goto err;
11760 res = PyDict_SetItem(new, key, Py_None);
11761 Py_DECREF(key);
11762 if (res < 0)
11763 goto err;
11764 }
11765 }
11766 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011767 int kind;
11768 void *data;
11769
Georg Brandlceee0772007-11-27 23:48:05 +000011770 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000011771 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011772 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
11773 "to maketrans it must be a dict");
11774 goto err;
11775 }
11776 /* copy entries into the new dict, converting string keys to int keys */
11777 while (PyDict_Next(x, &i, &key, &value)) {
11778 if (PyUnicode_Check(key)) {
11779 /* convert string keys to integer keys */
11780 PyObject *newkey;
11781 if (PyUnicode_GET_SIZE(key) != 1) {
11782 PyErr_SetString(PyExc_ValueError, "string keys in translate "
11783 "table must be of length 1");
11784 goto err;
11785 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011786 kind = PyUnicode_KIND(key);
11787 data = PyUnicode_DATA(key);
11788 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000011789 if (!newkey)
11790 goto err;
11791 res = PyDict_SetItem(new, newkey, value);
11792 Py_DECREF(newkey);
11793 if (res < 0)
11794 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000011795 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011796 /* just keep integer keys */
11797 if (PyDict_SetItem(new, key, value) < 0)
11798 goto err;
11799 } else {
11800 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
11801 "be strings or integers");
11802 goto err;
11803 }
11804 }
11805 }
11806 return new;
11807 err:
11808 Py_DECREF(new);
11809 return NULL;
11810}
11811
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011812PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011813 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011814\n\
11815Return a copy of the string S, where all characters have been mapped\n\
11816through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011817Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000011818Unmapped characters are left untouched. Characters mapped to None\n\
11819are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011820
11821static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011822unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011823{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011824 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011825}
11826
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011827PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011828 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011829\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011830Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011831
11832static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011833unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011834{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011835 return fixup(self, fixupper);
11836}
11837
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011838PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011839 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011840\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000011841Pad a numeric string S with zeros on the left, to fill a field\n\
11842of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011843
11844static PyObject *
11845unicode_zfill(PyUnicodeObject *self, PyObject *args)
11846{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011847 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011848 PyUnicodeObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011849 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011850 int kind;
11851 void *data;
11852 Py_UCS4 chr;
11853
11854 if (PyUnicode_READY(self) == -1)
11855 return NULL;
11856
Martin v. Löwis18e16552006-02-15 17:27:45 +000011857 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011858 return NULL;
11859
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011860 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000011861 if (PyUnicode_CheckExact(self)) {
11862 Py_INCREF(self);
11863 return (PyObject*) self;
11864 }
11865 else
Victor Stinner2219e0a2011-10-01 01:16:59 +020011866 return PyUnicode_Copy((PyObject*)self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011867 }
11868
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011869 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011870
11871 u = pad(self, fill, 0, '0');
11872
Walter Dörwald068325e2002-04-15 13:36:47 +000011873 if (u == NULL)
11874 return NULL;
11875
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011876 kind = PyUnicode_KIND(u);
11877 data = PyUnicode_DATA(u);
11878 chr = PyUnicode_READ(kind, data, fill);
11879
11880 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011881 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011882 PyUnicode_WRITE(kind, data, 0, chr);
11883 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000011884 }
11885
11886 return (PyObject*) u;
11887}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011888
#if 0
/* Compiled out by the surrounding #if 0.  Forwards to
   PyUnicode_TransformDecimalAndSpaceToASCII() and returns its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
11896
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011897PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011898 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011899\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011900Return True if S starts with the specified prefix, False otherwise.\n\
11901With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011902With optional end, stop comparing S at that position.\n\
11903prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011904
11905static PyObject *
11906unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011907 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011908{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011909 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011910 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011911 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011912 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011913 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011914
Jesus Ceaac451502011-04-20 17:09:23 +020011915 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011916 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011917 if (PyTuple_Check(subobj)) {
11918 Py_ssize_t i;
11919 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11920 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011921 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011922 if (substring == NULL)
11923 return NULL;
11924 result = tailmatch(self, substring, start, end, -1);
11925 Py_DECREF(substring);
11926 if (result) {
11927 Py_RETURN_TRUE;
11928 }
11929 }
11930 /* nothing matched */
11931 Py_RETURN_FALSE;
11932 }
11933 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011934 if (substring == NULL) {
11935 if (PyErr_ExceptionMatches(PyExc_TypeError))
11936 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
11937 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011938 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011939 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011940 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011941 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011942 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011943}
11944
11945
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011946PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011947 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011948\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011949Return True if S ends with the specified suffix, False otherwise.\n\
11950With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011951With optional end, stop comparing S at that position.\n\
11952suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011953
11954static PyObject *
11955unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011956 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011957{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011958 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011959 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011960 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011961 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011962 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011963
Jesus Ceaac451502011-04-20 17:09:23 +020011964 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011965 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011966 if (PyTuple_Check(subobj)) {
11967 Py_ssize_t i;
11968 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11969 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011970 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011971 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011972 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011973 result = tailmatch(self, substring, start, end, +1);
11974 Py_DECREF(substring);
11975 if (result) {
11976 Py_RETURN_TRUE;
11977 }
11978 }
11979 Py_RETURN_FALSE;
11980 }
11981 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011982 if (substring == NULL) {
11983 if (PyErr_ExceptionMatches(PyExc_TypeError))
11984 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
11985 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011986 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011987 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011988 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011989 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011990 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011991}
11992
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011993#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000011994
11995PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011996 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000011997\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011998Return a formatted version of S, using substitutions from args and kwargs.\n\
11999The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012000
Eric Smith27bbca62010-11-04 17:06:58 +000012001PyDoc_STRVAR(format_map__doc__,
12002 "S.format_map(mapping) -> str\n\
12003\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012004Return a formatted version of S, using substitutions from mapping.\n\
12005The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012006
Eric Smith4a7d76d2008-05-30 18:10:19 +000012007static PyObject *
12008unicode__format__(PyObject* self, PyObject* args)
12009{
12010 PyObject *format_spec;
12011
12012 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12013 return NULL;
12014
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012015 return _PyUnicode_FormatAdvanced(self, format_spec, 0,
12016 PyUnicode_GET_LENGTH(format_spec));
Eric Smith4a7d76d2008-05-30 18:10:19 +000012017}
12018
Eric Smith8c663262007-08-25 02:26:07 +000012019PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012020 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012021\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012022Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012023
12024static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012025unicode__sizeof__(PyUnicodeObject *v)
12026{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012027 Py_ssize_t size;
12028
12029 /* If it's a compact object, account for base structure +
12030 character data. */
12031 if (PyUnicode_IS_COMPACT_ASCII(v))
12032 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12033 else if (PyUnicode_IS_COMPACT(v))
12034 size = sizeof(PyCompactUnicodeObject) +
12035 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v);
12036 else {
12037 /* If it is a two-block object, account for base object, and
12038 for character block if present. */
12039 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012040 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012041 size += (PyUnicode_GET_LENGTH(v) + 1) *
12042 PyUnicode_CHARACTER_SIZE(v);
12043 }
12044 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012045 with the data pointer. Check if the data is not shared. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012046 if (_PyUnicode_WSTR(v) &&
Victor Stinnera3be6132011-10-03 02:16:37 +020012047 (PyUnicode_DATA(v) != _PyUnicode_WSTR(v)))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012048 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012049 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012050 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012051
12052 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012053}
12054
12055PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012056 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012057
12058static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012059unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012060{
Victor Stinner034f6cf2011-09-30 02:26:44 +020012061 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012062 if (!copy)
12063 return NULL;
12064 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012065}
12066
Guido van Rossumd57fd912000-03-10 22:53:23 +000012067static PyMethodDef unicode_methods[] = {
12068
12069 /* Order is according to common usage: often used methods should
12070 appear first, since lookup is done sequentially. */
12071
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012072 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012073 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12074 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012075 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012076 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12077 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12078 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12079 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12080 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12081 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12082 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012083 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012084 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12085 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12086 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012087 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012088 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12089 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12090 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012091 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012092 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012093 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012094 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012095 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12096 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12097 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12098 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12099 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12100 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12101 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12102 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12103 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12104 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12105 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12106 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12107 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12108 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012109 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012110 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012111 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012112 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012113 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012114 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012115 {"maketrans", (PyCFunction) unicode_maketrans,
12116 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012117 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012118#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012119 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012120#endif
12121
12122#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012123 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012124 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012125#endif
12126
Benjamin Peterson14339b62009-01-31 16:36:08 +000012127 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012128 {NULL, NULL}
12129};
12130
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012131static PyObject *
12132unicode_mod(PyObject *v, PyObject *w)
12133{
Brian Curtindfc80e32011-08-10 20:28:54 -050012134 if (!PyUnicode_Check(v))
12135 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012136 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012137}
12138
12139static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012140 0, /*nb_add*/
12141 0, /*nb_subtract*/
12142 0, /*nb_multiply*/
12143 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012144};
12145
Guido van Rossumd57fd912000-03-10 22:53:23 +000012146static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012147 (lenfunc) unicode_length, /* sq_length */
12148 PyUnicode_Concat, /* sq_concat */
12149 (ssizeargfunc) unicode_repeat, /* sq_repeat */
12150 (ssizeargfunc) unicode_getitem, /* sq_item */
12151 0, /* sq_slice */
12152 0, /* sq_ass_item */
12153 0, /* sq_ass_slice */
12154 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012155};
12156
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012157static PyObject*
12158unicode_subscript(PyUnicodeObject* self, PyObject* item)
12159{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012160 if (PyUnicode_READY(self) == -1)
12161 return NULL;
12162
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000012163 if (PyIndex_Check(item)) {
12164 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012165 if (i == -1 && PyErr_Occurred())
12166 return NULL;
12167 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012168 i += PyUnicode_GET_LENGTH(self);
Victor Stinner2fe5ced2011-10-02 00:25:40 +020012169 return unicode_getitem((PyObject*)self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012170 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000012171 Py_ssize_t start, stop, step, slicelength, cur, i;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012172 const Py_UNICODE* source_buf;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012173 Py_UNICODE* result_buf;
12174 PyObject* result;
12175
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012176 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000012177 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012178 return NULL;
12179 }
12180
12181 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012182 return PyUnicode_New(0, 0);
12183 } else if (start == 0 && step == 1 &&
12184 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000012185 PyUnicode_CheckExact(self)) {
12186 Py_INCREF(self);
12187 return (PyObject *)self;
12188 } else if (step == 1) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012189 return PyUnicode_Substring((PyObject*)self,
12190 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012191 } else {
12192 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +000012193 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
12194 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012195
Benjamin Peterson29060642009-01-31 22:14:21 +000012196 if (result_buf == NULL)
12197 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012198
12199 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
12200 result_buf[i] = source_buf[cur];
12201 }
Tim Petersced69f82003-09-16 20:30:58 +000012202
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012203 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +000012204 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012205 return result;
12206 }
12207 } else {
12208 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
12209 return NULL;
12210 }
12211}
12212
12213static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012214 (lenfunc)unicode_length, /* mp_length */
12215 (binaryfunc)unicode_subscript, /* mp_subscript */
12216 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012217};
12218
Guido van Rossumd57fd912000-03-10 22:53:23 +000012219
Guido van Rossumd57fd912000-03-10 22:53:23 +000012220/* Helpers for PyUnicode_Format() */
12221
12222static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000012223getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012224{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012225 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012226 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012227 (*p_argidx)++;
12228 if (arglen < 0)
12229 return args;
12230 else
12231 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012232 }
12233 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012234 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012235 return NULL;
12236}
12237
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012238/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012239
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012240static PyObject *
12241formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012242{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012243 char *p;
12244 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012245 double x;
Tim Petersced69f82003-09-16 20:30:58 +000012246
Guido van Rossumd57fd912000-03-10 22:53:23 +000012247 x = PyFloat_AsDouble(v);
12248 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012249 return NULL;
12250
Guido van Rossumd57fd912000-03-10 22:53:23 +000012251 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012252 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000012253
Eric Smith0923d1d2009-04-16 20:16:10 +000012254 p = PyOS_double_to_string(x, type, prec,
12255 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012256 if (p == NULL)
12257 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012258 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000012259 PyMem_Free(p);
12260 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012261}
12262
Tim Peters38fd5b62000-09-21 05:43:11 +000012263static PyObject*
12264formatlong(PyObject *val, int flags, int prec, int type)
12265{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012266 char *buf;
12267 int len;
12268 PyObject *str; /* temporary string object. */
12269 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012270
Benjamin Peterson14339b62009-01-31 16:36:08 +000012271 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
12272 if (!str)
12273 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012274 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012275 Py_DECREF(str);
12276 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012277}
12278
Guido van Rossumd57fd912000-03-10 22:53:23 +000012279static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012280formatchar(Py_UCS4 *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012281 size_t buflen,
12282 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012283{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012284 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012285 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012286 if (PyUnicode_GET_LENGTH(v) == 1) {
12287 buf[0] = PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000012288 buf[1] = '\0';
12289 return 1;
12290 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012291 goto onError;
12292 }
12293 else {
12294 /* Integer input truncated to a character */
12295 long x;
12296 x = PyLong_AsLong(v);
12297 if (x == -1 && PyErr_Occurred())
12298 goto onError;
12299
12300 if (x < 0 || x > 0x10ffff) {
12301 PyErr_SetString(PyExc_OverflowError,
12302 "%c arg not in range(0x110000)");
12303 return -1;
12304 }
12305
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012306 buf[0] = (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012307 buf[1] = '\0';
12308 return 1;
12309 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012310
Benjamin Peterson29060642009-01-31 22:14:21 +000012311 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012312 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012313 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012314 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012315}
12316
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012317/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012318 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012319*/
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012320#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012321
Alexander Belopolsky40018472011-02-26 01:02:56 +000012322PyObject *
12323PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012324{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012325 void *fmt;
12326 int fmtkind;
12327 PyObject *result;
12328 Py_UCS4 *res, *res0;
12329 Py_UCS4 max;
12330 int kind;
12331 Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012332 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012333 PyObject *dict = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012334 PyUnicodeObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +000012335
Guido van Rossumd57fd912000-03-10 22:53:23 +000012336 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012337 PyErr_BadInternalCall();
12338 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012339 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012340 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
12341 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012342 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012343 fmt = PyUnicode_DATA(uformat);
12344 fmtkind = PyUnicode_KIND(uformat);
12345 fmtcnt = PyUnicode_GET_LENGTH(uformat);
12346 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012347
12348 reslen = rescnt = fmtcnt + 100;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012349 res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4));
12350 if (res0 == NULL) {
12351 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012352 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012353 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012354
12355 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012356 arglen = PyTuple_Size(args);
12357 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012358 }
12359 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012360 arglen = -1;
12361 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012362 }
Christian Heimes90aa7642007-12-19 02:45:37 +000012363 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000012364 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000012365 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012366
12367 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012368 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012369 if (--rescnt < 0) {
12370 rescnt = fmtcnt + 100;
12371 reslen += rescnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012372 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12373 if (res0 == NULL){
12374 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012375 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012376 }
12377 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012378 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012379 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012380 *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012381 }
12382 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012383 /* Got a format specifier */
12384 int flags = 0;
12385 Py_ssize_t width = -1;
12386 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012387 Py_UCS4 c = '\0';
12388 Py_UCS4 fill;
Benjamin Peterson29060642009-01-31 22:14:21 +000012389 int isnumok;
12390 PyObject *v = NULL;
12391 PyObject *temp = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012392 void *pbuf;
12393 Py_ssize_t pindex;
Benjamin Peterson29060642009-01-31 22:14:21 +000012394 Py_UNICODE sign;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012395 Py_ssize_t len, len1;
12396 Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012397
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012398 fmtpos++;
12399 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
12400 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000012401 Py_ssize_t keylen;
12402 PyObject *key;
12403 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000012404
Benjamin Peterson29060642009-01-31 22:14:21 +000012405 if (dict == NULL) {
12406 PyErr_SetString(PyExc_TypeError,
12407 "format requires a mapping");
12408 goto onError;
12409 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012410 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012411 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012412 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012413 /* Skip over balanced parentheses */
12414 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012415 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000012416 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012417 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000012418 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012419 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000012420 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012421 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012422 if (fmtcnt < 0 || pcount > 0) {
12423 PyErr_SetString(PyExc_ValueError,
12424 "incomplete format key");
12425 goto onError;
12426 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020012427 key = PyUnicode_Substring((PyObject*)uformat,
12428 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000012429 if (key == NULL)
12430 goto onError;
12431 if (args_owned) {
12432 Py_DECREF(args);
12433 args_owned = 0;
12434 }
12435 args = PyObject_GetItem(dict, key);
12436 Py_DECREF(key);
12437 if (args == NULL) {
12438 goto onError;
12439 }
12440 args_owned = 1;
12441 arglen = -1;
12442 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012443 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012444 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012445 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012446 case '-': flags |= F_LJUST; continue;
12447 case '+': flags |= F_SIGN; continue;
12448 case ' ': flags |= F_BLANK; continue;
12449 case '#': flags |= F_ALT; continue;
12450 case '0': flags |= F_ZERO; continue;
12451 }
12452 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012453 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012454 if (c == '*') {
12455 v = getnextarg(args, arglen, &argidx);
12456 if (v == NULL)
12457 goto onError;
12458 if (!PyLong_Check(v)) {
12459 PyErr_SetString(PyExc_TypeError,
12460 "* wants int");
12461 goto onError;
12462 }
12463 width = PyLong_AsLong(v);
12464 if (width == -1 && PyErr_Occurred())
12465 goto onError;
12466 if (width < 0) {
12467 flags |= F_LJUST;
12468 width = -width;
12469 }
12470 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012471 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012472 }
12473 else if (c >= '0' && c <= '9') {
12474 width = c - '0';
12475 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012476 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012477 if (c < '0' || c > '9')
12478 break;
12479 if ((width*10) / 10 != width) {
12480 PyErr_SetString(PyExc_ValueError,
12481 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000012482 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000012483 }
12484 width = width*10 + (c - '0');
12485 }
12486 }
12487 if (c == '.') {
12488 prec = 0;
12489 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012490 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012491 if (c == '*') {
12492 v = getnextarg(args, arglen, &argidx);
12493 if (v == NULL)
12494 goto onError;
12495 if (!PyLong_Check(v)) {
12496 PyErr_SetString(PyExc_TypeError,
12497 "* wants int");
12498 goto onError;
12499 }
12500 prec = PyLong_AsLong(v);
12501 if (prec == -1 && PyErr_Occurred())
12502 goto onError;
12503 if (prec < 0)
12504 prec = 0;
12505 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012506 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012507 }
12508 else if (c >= '0' && c <= '9') {
12509 prec = c - '0';
12510 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012511 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012512 if (c < '0' || c > '9')
12513 break;
12514 if ((prec*10) / 10 != prec) {
12515 PyErr_SetString(PyExc_ValueError,
12516 "prec too big");
12517 goto onError;
12518 }
12519 prec = prec*10 + (c - '0');
12520 }
12521 }
12522 } /* prec */
12523 if (fmtcnt >= 0) {
12524 if (c == 'h' || c == 'l' || c == 'L') {
12525 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012526 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012527 }
12528 }
12529 if (fmtcnt < 0) {
12530 PyErr_SetString(PyExc_ValueError,
12531 "incomplete format");
12532 goto onError;
12533 }
12534 if (c != '%') {
12535 v = getnextarg(args, arglen, &argidx);
12536 if (v == NULL)
12537 goto onError;
12538 }
12539 sign = 0;
12540 fill = ' ';
12541 switch (c) {
12542
12543 case '%':
12544 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012545 kind = PyUnicode_4BYTE_KIND;
Benjamin Peterson29060642009-01-31 22:14:21 +000012546 /* presume that buffer length is at least 1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012547 PyUnicode_WRITE(kind, pbuf, 0, '%');
Benjamin Peterson29060642009-01-31 22:14:21 +000012548 len = 1;
12549 break;
12550
12551 case 's':
12552 case 'r':
12553 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000012554 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012555 temp = v;
12556 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012557 }
12558 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012559 if (c == 's')
12560 temp = PyObject_Str(v);
12561 else if (c == 'r')
12562 temp = PyObject_Repr(v);
12563 else
12564 temp = PyObject_ASCII(v);
12565 if (temp == NULL)
12566 goto onError;
12567 if (PyUnicode_Check(temp))
12568 /* nothing to do */;
12569 else {
12570 Py_DECREF(temp);
12571 PyErr_SetString(PyExc_TypeError,
12572 "%s argument has non-string str()");
12573 goto onError;
12574 }
12575 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012576 if (PyUnicode_READY(temp) == -1) {
12577 Py_CLEAR(temp);
12578 goto onError;
12579 }
12580 pbuf = PyUnicode_DATA(temp);
12581 kind = PyUnicode_KIND(temp);
12582 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012583 if (prec >= 0 && len > prec)
12584 len = prec;
12585 break;
12586
12587 case 'i':
12588 case 'd':
12589 case 'u':
12590 case 'o':
12591 case 'x':
12592 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000012593 isnumok = 0;
12594 if (PyNumber_Check(v)) {
12595 PyObject *iobj=NULL;
12596
12597 if (PyLong_Check(v)) {
12598 iobj = v;
12599 Py_INCREF(iobj);
12600 }
12601 else {
12602 iobj = PyNumber_Long(v);
12603 }
12604 if (iobj!=NULL) {
12605 if (PyLong_Check(iobj)) {
12606 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070012607 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000012608 Py_DECREF(iobj);
12609 if (!temp)
12610 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012611 if (PyUnicode_READY(temp) == -1) {
12612 Py_CLEAR(temp);
12613 goto onError;
12614 }
12615 pbuf = PyUnicode_DATA(temp);
12616 kind = PyUnicode_KIND(temp);
12617 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012618 sign = 1;
12619 }
12620 else {
12621 Py_DECREF(iobj);
12622 }
12623 }
12624 }
12625 if (!isnumok) {
12626 PyErr_Format(PyExc_TypeError,
12627 "%%%c format: a number is required, "
12628 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
12629 goto onError;
12630 }
12631 if (flags & F_ZERO)
12632 fill = '0';
12633 break;
12634
12635 case 'e':
12636 case 'E':
12637 case 'f':
12638 case 'F':
12639 case 'g':
12640 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012641 temp = formatfloat(v, flags, prec, c);
12642 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000012643 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012644 if (PyUnicode_READY(temp) == -1) {
12645 Py_CLEAR(temp);
12646 goto onError;
12647 }
12648 pbuf = PyUnicode_DATA(temp);
12649 kind = PyUnicode_KIND(temp);
12650 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012651 sign = 1;
12652 if (flags & F_ZERO)
12653 fill = '0';
12654 break;
12655
12656 case 'c':
12657 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012658 kind = PyUnicode_4BYTE_KIND;
Victor Stinnerb9dcffb2011-09-29 00:39:24 +020012659 len = formatchar(pbuf, Py_ARRAY_LENGTH(formatbuf), v);
Benjamin Peterson29060642009-01-31 22:14:21 +000012660 if (len < 0)
12661 goto onError;
12662 break;
12663
12664 default:
12665 PyErr_Format(PyExc_ValueError,
12666 "unsupported format character '%c' (0x%x) "
12667 "at index %zd",
12668 (31<=c && c<=126) ? (char)c : '?',
12669 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012670 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000012671 goto onError;
12672 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012673 /* pbuf is initialized here. */
12674 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000012675 if (sign) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012676 if (PyUnicode_READ(kind, pbuf, pindex) == '-' ||
12677 PyUnicode_READ(kind, pbuf, pindex) == '+') {
12678 sign = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012679 len--;
12680 }
12681 else if (flags & F_SIGN)
12682 sign = '+';
12683 else if (flags & F_BLANK)
12684 sign = ' ';
12685 else
12686 sign = 0;
12687 }
12688 if (width < len)
12689 width = len;
12690 if (rescnt - (sign != 0) < width) {
12691 reslen -= rescnt;
12692 rescnt = width + fmtcnt + 100;
12693 reslen += rescnt;
12694 if (reslen < 0) {
12695 Py_XDECREF(temp);
12696 PyErr_NoMemory();
12697 goto onError;
12698 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012699 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12700 if (res0 == 0) {
12701 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012702 Py_XDECREF(temp);
12703 goto onError;
12704 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012705 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012706 }
12707 if (sign) {
12708 if (fill != ' ')
12709 *res++ = sign;
12710 rescnt--;
12711 if (width > len)
12712 width--;
12713 }
12714 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012715 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12716 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000012717 if (fill != ' ') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012718 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12719 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012720 }
12721 rescnt -= 2;
12722 width -= 2;
12723 if (width < 0)
12724 width = 0;
12725 len -= 2;
12726 }
12727 if (width > len && !(flags & F_LJUST)) {
12728 do {
12729 --rescnt;
12730 *res++ = fill;
12731 } while (--width > len);
12732 }
12733 if (fill == ' ') {
12734 if (sign)
12735 *res++ = sign;
12736 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012737 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12738 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
12739 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12740 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012741 }
12742 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012743 /* Copy all characters, preserving len */
12744 len1 = len;
12745 while (len1--) {
12746 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12747 rescnt--;
12748 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012749 while (--width >= len) {
12750 --rescnt;
12751 *res++ = ' ';
12752 }
12753 if (dict && (argidx < arglen) && c != '%') {
12754 PyErr_SetString(PyExc_TypeError,
12755 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +000012756 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012757 goto onError;
12758 }
12759 Py_XDECREF(temp);
12760 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012761 } /* until end */
12762 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012763 PyErr_SetString(PyExc_TypeError,
12764 "not all arguments converted during string formatting");
12765 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012766 }
12767
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012768
12769 for (max=0, res = res0; res < res0+reslen-rescnt; res++)
12770 if (*res > max)
12771 max = *res;
12772 result = PyUnicode_New(reslen - rescnt, max);
12773 if (!result)
Benjamin Peterson29060642009-01-31 22:14:21 +000012774 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012775 kind = PyUnicode_KIND(result);
12776 for (res = res0; res < res0+reslen-rescnt; res++)
12777 PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res);
12778 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012779 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012780 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012781 }
12782 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012783 return (PyObject *)result;
12784
Benjamin Peterson29060642009-01-31 22:14:21 +000012785 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012786 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012787 Py_DECREF(uformat);
12788 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012789 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012790 }
12791 return NULL;
12792}
12793
Jeremy Hylton938ace62002-07-17 16:30:39 +000012794static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000012795unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
12796
Tim Peters6d6c1a32001-08-02 04:15:00 +000012797static PyObject *
12798unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12799{
Benjamin Peterson29060642009-01-31 22:14:21 +000012800 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012801 static char *kwlist[] = {"object", "encoding", "errors", 0};
12802 char *encoding = NULL;
12803 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000012804
Benjamin Peterson14339b62009-01-31 16:36:08 +000012805 if (type != &PyUnicode_Type)
12806 return unicode_subtype_new(type, args, kwds);
12807 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000012808 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012809 return NULL;
12810 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012811 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012812 if (encoding == NULL && errors == NULL)
12813 return PyObject_Str(x);
12814 else
Benjamin Peterson29060642009-01-31 22:14:21 +000012815 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000012816}
12817
Guido van Rossume023fe02001-08-30 03:12:59 +000012818static PyObject *
12819unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12820{
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012821 PyUnicodeObject *unicode, *self;
12822 Py_ssize_t length, char_size;
12823 int share_wstr, share_utf8;
12824 unsigned int kind;
12825 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000012826
Benjamin Peterson14339b62009-01-31 16:36:08 +000012827 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012828
12829 unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
12830 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012831 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020012832 assert(_PyUnicode_CHECK(unicode));
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020012833 if (_PyUnicode_READY_REPLACE(&unicode))
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012834 return NULL;
12835
12836 self = (PyUnicodeObject *) type->tp_alloc(type, 0);
12837 if (self == NULL) {
12838 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012839 return NULL;
12840 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012841 kind = PyUnicode_KIND(unicode);
12842 length = PyUnicode_GET_LENGTH(unicode);
12843
12844 _PyUnicode_LENGTH(self) = length;
12845 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
12846 _PyUnicode_STATE(self).interned = 0;
12847 _PyUnicode_STATE(self).kind = kind;
12848 _PyUnicode_STATE(self).compact = 0;
12849 _PyUnicode_STATE(self).ascii = 0;
12850 _PyUnicode_STATE(self).ready = 1;
12851 _PyUnicode_WSTR(self) = NULL;
12852 _PyUnicode_UTF8_LENGTH(self) = 0;
12853 _PyUnicode_UTF8(self) = NULL;
12854 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020012855 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012856
12857 share_utf8 = 0;
12858 share_wstr = 0;
12859 if (kind == PyUnicode_1BYTE_KIND) {
12860 char_size = 1;
12861 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
12862 share_utf8 = 1;
12863 }
12864 else if (kind == PyUnicode_2BYTE_KIND) {
12865 char_size = 2;
12866 if (sizeof(wchar_t) == 2)
12867 share_wstr = 1;
12868 }
12869 else {
12870 assert(kind == PyUnicode_4BYTE_KIND);
12871 char_size = 4;
12872 if (sizeof(wchar_t) == 4)
12873 share_wstr = 1;
12874 }
12875
12876 /* Ensure we won't overflow the length. */
12877 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
12878 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012879 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012880 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012881 data = PyObject_MALLOC((length + 1) * char_size);
12882 if (data == NULL) {
12883 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012884 goto onError;
12885 }
12886
Victor Stinnerc3c74152011-10-02 20:39:55 +020012887 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012888 if (share_utf8) {
12889 _PyUnicode_UTF8_LENGTH(self) = length;
12890 _PyUnicode_UTF8(self) = data;
12891 }
12892 if (share_wstr) {
12893 _PyUnicode_WSTR_LENGTH(self) = length;
12894 _PyUnicode_WSTR(self) = (wchar_t *)data;
12895 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012896
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012897 Py_MEMCPY(data, PyUnicode_DATA(unicode),
12898 PyUnicode_KIND_SIZE(kind, length + 1));
12899 Py_DECREF(unicode);
12900 return (PyObject *)self;
12901
12902onError:
12903 Py_DECREF(unicode);
12904 Py_DECREF(self);
12905 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000012906}
12907
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012908PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000012909 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000012910\n\
Collin Winterd474ce82007-08-07 19:42:11 +000012911Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000012912encoding defaults to the current default string encoding.\n\
12913errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000012914
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012915static PyObject *unicode_iter(PyObject *seq);
12916
/* Type object for the built-in "str" type.  Slots left at 0 are not provided
   by this table. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",              /* tp_name */
    sizeof(PyUnicodeObject),        /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    unicode_repr,           /* tp_repr */
    &unicode_as_number,         /* tp_as_number */
    &unicode_as_sequence,       /* tp_as_sequence */
    &unicode_as_mapping,        /* tp_as_mapping */
    (hashfunc) unicode_hash,        /* tp_hash*/
    0,                  /* tp_call*/
    (reprfunc) unicode_str,     /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
    unicode_doc,            /* tp_doc */
    0,                  /* tp_traverse */
    0,                  /* tp_clear */
    PyUnicode_RichCompare,      /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    unicode_iter,           /* tp_iter */
    0,                  /* tp_iternext */
    unicode_methods,            /* tp_methods */
    0,                  /* tp_members */
    0,                  /* tp_getset */
    &PyBaseObject_Type,         /* tp_base */
    0,                  /* tp_dict */
    0,                  /* tp_descr_get */
    0,                  /* tp_descr_set */
    0,                  /* tp_dictoffset */
    0,                  /* tp_init */
    0,                  /* tp_alloc */
    unicode_new,            /* tp_new */
    PyObject_Del,           /* tp_free */
};
12960
12961/* Initialize the Unicode implementation */
12962
Thomas Wouters78890102000-07-22 19:25:51 +000012963void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012964{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012965 int i;
12966
Thomas Wouters477c8d52006-05-27 19:21:47 +000012967 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012968 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012969 0x000A, /* LINE FEED */
12970 0x000D, /* CARRIAGE RETURN */
12971 0x001C, /* FILE SEPARATOR */
12972 0x001D, /* GROUP SEPARATOR */
12973 0x001E, /* RECORD SEPARATOR */
12974 0x0085, /* NEXT LINE */
12975 0x2028, /* LINE SEPARATOR */
12976 0x2029, /* PARAGRAPH SEPARATOR */
12977 };
12978
Fred Drakee4315f52000-05-09 19:53:39 +000012979 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020012980 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012981 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012982 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012983
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012984 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000012985 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000012986 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012987 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012988
12989 /* initialize the linebreak bloom filter */
12990 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012991 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020012992 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012993
12994 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012995}
12996
12997/* Finalize the Unicode implementation */
12998
/* Performs no work in this implementation and always returns 0.
   NOTE(review): presumably kept so existing callers of the public name keep
   linking; confirm before removing. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
13004
Guido van Rossumd57fd912000-03-10 22:53:23 +000013005void
Thomas Wouters78890102000-07-22 19:25:51 +000013006_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013007{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013008 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013009
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000013010 Py_XDECREF(unicode_empty);
13011 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000013012
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013013 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013014 if (unicode_latin1[i]) {
13015 Py_DECREF(unicode_latin1[i]);
13016 unicode_latin1[i] = NULL;
13017 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013018 }
Christian Heimesa156e092008-02-16 07:38:31 +000013019 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000013020}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000013021
/* Intern the string referenced by *p, in place.

   On return *p either still refers to the original object (now recorded in
   the `interned` dict, or left untouched if interning was not possible) or
   has been replaced by the equal string that was already interned; in the
   latter case the caller's reference to the old object is released and a new
   reference to the interned one is stored.

   The function never reports an error: every failure path clears the
   exception (where one may have been set) and returns with *p usable. */
void
PyUnicode_InternInPlace(PyObject **p)
{
    register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
    PyObject *t;
#ifdef Py_DEBUG
    assert(s != NULL);
    assert(_PyUnicode_CHECK(s));
#else
    /* Release builds silently ignore NULL and non-string arguments. */
    if (s == NULL || !PyUnicode_Check(s))
        return;
#endif
    /* If it's a subclass, we don't really know what putting
       it in the interned dict might do. */
    if (!PyUnicode_CheckExact(s))
        return;
    /* Already interned: nothing to do. */
    if (PyUnicode_CHECK_INTERNED(s))
        return;
    /* NOTE(review): _PyUnicode_READY_REPLACE appears to be able to swap the
       object stored in *p, which is why `s` is re-read just below. */
    if (_PyUnicode_READY_REPLACE(p)) {
        assert(0 && "PyUnicode_READY fail in PyUnicode_InternInPlace");
        return;
    }
    s = (PyUnicodeObject *)(*p);
    /* The dict of interned strings is created on first use. */
    if (interned == NULL) {
        interned = PyDict_New();
        if (interned == NULL) {
            PyErr_Clear(); /* Don't leave an exception */
            return;
        }
    }
    /* It might be that the GetItem call fails even
       though the key is present in the dictionary,
       namely when this happens during a stack overflow. */
    Py_ALLOW_RECURSION
    t = PyDict_GetItem(interned, (PyObject *)s);
    Py_END_ALLOW_RECURSION

    if (t) {
        /* An equal string is already interned: hand that one back.  Take the
           new reference before dropping the old one. */
        Py_INCREF(t);
        Py_DECREF(*p);
        *p = t;
        return;
    }

    /* Insert the string as both key and value; the recursion_critical flag
       is raised only for the duration of the insertion. */
    PyThreadState_GET()->recursion_critical = 1;
    if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
        PyErr_Clear();
        PyThreadState_GET()->recursion_critical = 0;
        return;
    }
    PyThreadState_GET()->recursion_critical = 0;
    /* The two references in interned are not counted by refcnt.
       The deallocator will take care of this */
    Py_REFCNT(s) -= 2;
    _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
}
13078
13079void
13080PyUnicode_InternImmortal(PyObject **p)
13081{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013082 PyUnicodeObject *u = (PyUnicodeObject *)*p;
13083
Benjamin Peterson14339b62009-01-31 16:36:08 +000013084 PyUnicode_InternInPlace(p);
13085 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013086 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013087 Py_INCREF(*p);
13088 }
Walter Dörwald16807132007-05-25 13:52:07 +000013089}
13090
13091PyObject *
13092PyUnicode_InternFromString(const char *cp)
13093{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013094 PyObject *s = PyUnicode_FromString(cp);
13095 if (s == NULL)
13096 return NULL;
13097 PyUnicode_InternInPlace(&s);
13098 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000013099}
13100
Alexander Belopolsky40018472011-02-26 01:02:56 +000013101void
13102_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000013103{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013104 PyObject *keys;
13105 PyUnicodeObject *s;
13106 Py_ssize_t i, n;
13107 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000013108
Benjamin Peterson14339b62009-01-31 16:36:08 +000013109 if (interned == NULL || !PyDict_Check(interned))
13110 return;
13111 keys = PyDict_Keys(interned);
13112 if (keys == NULL || !PyList_Check(keys)) {
13113 PyErr_Clear();
13114 return;
13115 }
Walter Dörwald16807132007-05-25 13:52:07 +000013116
Benjamin Peterson14339b62009-01-31 16:36:08 +000013117 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
13118 detector, interned unicode strings are not forcibly deallocated;
13119 rather, we give them their stolen references back, and then clear
13120 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000013121
Benjamin Peterson14339b62009-01-31 16:36:08 +000013122 n = PyList_GET_SIZE(keys);
13123 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000013124 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013125 for (i = 0; i < n; i++) {
13126 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013127 if (PyUnicode_READY(s) == -1)
13128 fprintf(stderr, "could not ready string\n");
13129 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013130 case SSTATE_NOT_INTERNED:
13131 /* XXX Shouldn't happen */
13132 break;
13133 case SSTATE_INTERNED_IMMORTAL:
13134 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013135 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013136 break;
13137 case SSTATE_INTERNED_MORTAL:
13138 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013139 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013140 break;
13141 default:
13142 Py_FatalError("Inconsistent interned string state.");
13143 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013144 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013145 }
13146 fprintf(stderr, "total size of all interned strings: "
13147 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
13148 "mortal/immortal\n", mortal_size, immortal_size);
13149 Py_DECREF(keys);
13150 PyDict_Clear(interned);
13151 Py_DECREF(interned);
13152 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000013153}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013154
13155
13156/********************* Unicode Iterator **************************/
13157
/* State of a str iterator: the string being iterated and the position of the
   next character to return. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;    /* index of the next character to yield */
    PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
13163
/* Destructor for str iterators: stop GC tracking, drop the reference to the
   iterated string (if the iterator still holds one) and free the object. */
static void
unicodeiter_dealloc(unicodeiterobject *it)
{
    _PyObject_GC_UNTRACK(it);
    Py_XDECREF(it->it_seq);  /* it_seq is NULL once the iterator is exhausted */
    PyObject_GC_Del(it);
}
13171
/* GC traversal for str iterators: the iterated string is the only object
   reference held.  The parameter names `visit` and `arg` are required by the
   Py_VISIT macro. */
static int
unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
{
    Py_VISIT(it->it_seq);
    return 0;
}
13178
13179static PyObject *
13180unicodeiter_next(unicodeiterobject *it)
13181{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013182 PyUnicodeObject *seq;
13183 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013184
Benjamin Peterson14339b62009-01-31 16:36:08 +000013185 assert(it != NULL);
13186 seq = it->it_seq;
13187 if (seq == NULL)
13188 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013189 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013190
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013191 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
13192 int kind = PyUnicode_KIND(seq);
13193 void *data = PyUnicode_DATA(seq);
13194 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
13195 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013196 if (item != NULL)
13197 ++it->it_index;
13198 return item;
13199 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013200
Benjamin Peterson14339b62009-01-31 16:36:08 +000013201 Py_DECREF(seq);
13202 it->it_seq = NULL;
13203 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013204}
13205
13206static PyObject *
13207unicodeiter_len(unicodeiterobject *it)
13208{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013209 Py_ssize_t len = 0;
13210 if (it->it_seq)
13211 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
13212 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013213}
13214
/* Docstring shared by the __length_hint__ entry below. */
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");

/* Method table of the str iterator type; only __length_hint__ is exposed. */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {NULL,      NULL}       /* sentinel */
};
13222
/* Type object of the iterator returned by iterating over a str.  Instances
   take part in garbage collection (Py_TPFLAGS_HAVE_GC) and are their own
   iterator (tp_iter is PyObject_SelfIter). */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",         /* tp_name */
    sizeof(unicodeiterobject),      /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    0,                  /* tp_repr */
    0,                  /* tp_as_number */
    0,                  /* tp_as_sequence */
    0,                  /* tp_as_mapping */
    0,                  /* tp_hash */
    0,                  /* tp_call */
    0,                  /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                  /* tp_clear */
    0,                  /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    PyObject_SelfIter,          /* tp_iter */
    (iternextfunc)unicodeiter_next, /* tp_iternext */
    unicodeiter_methods,        /* tp_methods */
    0,                  /* tp_members */
};
13255
13256static PyObject *
13257unicode_iter(PyObject *seq)
13258{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013259 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013260
Benjamin Peterson14339b62009-01-31 16:36:08 +000013261 if (!PyUnicode_Check(seq)) {
13262 PyErr_BadInternalCall();
13263 return NULL;
13264 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013265 if (PyUnicode_READY(seq) == -1)
13266 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013267 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
13268 if (it == NULL)
13269 return NULL;
13270 it->it_index = 0;
13271 Py_INCREF(seq);
13272 it->it_seq = (PyUnicodeObject *)seq;
13273 _PyObject_GC_TRACK(it);
13274 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013275}
13276
/* Include uniops.h twice, as a template: UNIOP(x) supplies the name prefix
   and UNIOP_t the element type.  The first pass uses Py_UNICODE_<x> with
   Py_UNICODE elements, the second Py_UCS4_<x> with Py_UCS4 elements.
   NOTE(review): the contents of uniops.h are not visible here; presumably it
   defines a family of helpers named through UNIOP(). */
#define UNIOP(x) Py_UNICODE_##x
#define UNIOP_t Py_UNICODE
#include "uniops.h"
#undef UNIOP
#undef UNIOP_t
#define UNIOP(x) Py_UCS4_##x
#define UNIOP_t Py_UCS4
#include "uniops.h"
#undef UNIOP
#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000013287
Victor Stinner71133ff2010-09-01 23:43:53 +000013288Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000013289PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000013290{
13291 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
13292 Py_UNICODE *copy;
13293 Py_ssize_t size;
13294
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013295 if (!PyUnicode_Check(unicode)) {
13296 PyErr_BadArgument();
13297 return NULL;
13298 }
Victor Stinner71133ff2010-09-01 23:43:53 +000013299 /* Ensure we won't overflow the size. */
13300 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
13301 PyErr_NoMemory();
13302 return NULL;
13303 }
13304 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
13305 size *= sizeof(Py_UNICODE);
13306 copy = PyMem_Malloc(size);
13307 if (copy == NULL) {
13308 PyErr_NoMemory();
13309 return NULL;
13310 }
13311 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
13312 return copy;
13313}
Martin v. Löwis5b222132007-06-10 09:51:05 +000013314
Georg Brandl66c221e2010-10-14 07:04:07 +000013315/* A _string module, to export formatter_parser and formatter_field_name_split
13316 to the string.Formatter class implemented in Python. */
13317
/* Functions exported by the _string module.  Both take exactly one argument
   (METH_O). */
static PyMethodDef _string_methods[] = {
    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
     METH_O, PyDoc_STR("split the argument as a field name")},
    {"formatter_parser", (PyCFunction) formatter_parser,
     METH_O, PyDoc_STR("parse the argument as a format string")},
    {NULL, NULL}    /* sentinel */
};
13325
/* Module definition for _string: a name, a docstring and the method table
   above; the remaining slots are left empty. */
static struct PyModuleDef _string_module = {
    PyModuleDef_HEAD_INIT,
    "_string",                              /* module name */
    PyDoc_STR("string helper module"),      /* module docstring */
    0,                                      /* per-module state size */
    _string_methods,                        /* module-level functions */
    NULL,
    NULL,
    NULL,
    NULL
};
13337
13338PyMODINIT_FUNC
13339PyInit__string(void)
13340{
13341 return PyModule_Create(&_string_module);
13342}
13343
13344
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013345#ifdef __cplusplus
13346}
13347#endif