blob: 77cc0820f911b58c0c51d36343761fd4ed5d7c0f [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
/* Upper bound on the number of entries of the Unicode object free list */

#define PyUnicode_MAXFREELIST 1024

/* Size threshold of the free list "stay alive" optimization.

   For every object on the free list whose size is below this
   threshold, the allocated Unicode memory is kept intact. This saves
   malloc() overhead for small Unicode objects.

   In the worst case this leaves PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage around.

   A threshold of 0 effectively turns the feature off.

   Note: This is an experimental feature! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Endianness switches; little endian unless WORDS_BIGENDIAN is defined */

#ifndef WORDS_BIGENDIAN
# define BYTEORDER_IS_LITTLE_ENDIAN
#else
# define BYTEORDER_IS_BIG_ENDIAN
#endif
79
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000080/* --- Globals ------------------------------------------------------------
81
82 The globals are initialized by the _PyUnicode_Init() API and should
83 not be used before calling that API.
84
85*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000086
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000087
88#ifdef __cplusplus
89extern "C" {
90#endif
91
/* Check used by the accessor macros below: the full consistency check
   in debug builds, a plain type check otherwise. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op)
#endif

/* Field accessors. The underscore-prefixed forms without assertions
   expand to the bare structure member and can be assigned to. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     (PyUnicode_IS_COMPACT_ASCII(op) ?                  \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op)))
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     (PyUnicode_IS_COMPACT_ASCII(op) ?                  \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op)))
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200126
/* Local override of PyUnicode_READY: evaluates to 0 when the object is
   already ready, otherwise to the result of _PyUnicode_Ready(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 : _PyUnicode_Ready((PyObject *)(op))))

/* Same as PyUnicode_READY, but takes the address of the object pointer
   and goes through _PyUnicode_ReadyReplace().  The argument is
   parenthesized before being dereferenced so that any pointer
   expression can be passed. */
#define _PyUnicode_READY_REPLACE(p_obj)                 \
    (assert(_PyUnicode_CHECK(*(p_obj))),                \
     (PyUnicode_IS_READY(*(p_obj)) ?                    \
      0 : _PyUnicode_ReadyReplace((PyObject **)(p_obj))))

/* true if the utf8 pointer is the character data itself */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* true if the wstr pointer is the character data itself.  Uses the
   macro argument throughout; it must not rely on the caller having a
   variable named "unicode" in scope. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
145
/* true if the Unicode object owns a UTF-8 memory block of its own,
   i.e. utf8 is set and is not the character data */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                          \
    (assert(_PyUnicode_CHECK(op)),                              \
     (!(PyUnicode_IS_COMPACT_ASCII(op)                          \
        || _PyUnicode_UTF8(op) == NULL)                         \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* true if the Unicode object owns a wstr memory block of its own,
   i.e. wstr is set and the object is either not ready or wstr is not
   the character data */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                          \
    (assert(_PyUnicode_CHECK(op)),                              \
     (_PyUnicode_WSTR(op) != NULL                               \
      && !(PyUnicode_IS_READY(op)                               \
           && _PyUnicode_WSTR(op) == PyUnicode_DATA(op))))
161
/* Generic helper macro copying a run of characters while converting
   the element type.  from_type and to_type must be valid type names.
   begin and end delimit the source characters and should be of type
   "from_type *"; to points to the destination buffer, which receives
   one "to_type" element per source character.  begin and to are
   evaluated once, end is evaluated on every iteration. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        const from_type *iter_ = (begin);               \
        to_type *to_ = (to_type *)(to);                 \
        while (iter_ < (end)) {                         \
            *to_++ = (to_type)*iter_++;                 \
        }                                               \
    } while (0)

/* The Unicode string has been modified: forget the cached hash by
   storing -1 in the hash field */
#define _PyUnicode_DIRTY(op)                            \
    do {                                                \
        _PyUnicode_HASH(op) = -1;                       \
    } while (0)
179
Walter Dörwald16807132007-05-25 13:52:07 +0000180/* This dictionary holds all interned unicode strings. Note that references
181 to strings in this dictionary are *not* counted in the string's ob_refcnt.
182 When the interned string reaches a refcnt of 0 the string deallocation
183 function will delete the reference from this dictionary.
184
   Another way to look at this is that the actual reference count of a
   string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000187*/
188static PyObject *interned;
189
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000190/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200191static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000192
193/* Single character Unicode strings in the Latin-1 range are being
194 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200195static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000196
/* Fast detection of the most frequent whitespace characters: one flag
   per code point of the 7-bit range, sixteen code points per row */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F
     *   0x0009 CHARACTER TABULATION
     *   0x000A LINE FEED
     *   0x000B LINE TABULATION
     *   0x000C FORM FEED
     *   0x000D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F
     *   0x001C FILE SEPARATOR
     *   0x001D GROUP SEPARATOR
     *   0x001E RECORD SEPARATOR
     *   0x001F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F
     *   0x0020 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
227
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200228/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200229static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200230static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200231
Alexander Belopolsky40018472011-02-26 01:02:56 +0000232static PyObject *
233unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000234 PyObject **errorHandler,const char *encoding, const char *reason,
235 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
236 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
237
Alexander Belopolsky40018472011-02-26 01:02:56 +0000238static void
239raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300240 const char *encoding,
241 const Py_UNICODE *unicode, Py_ssize_t size,
242 Py_ssize_t startpos, Py_ssize_t endpos,
243 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000244
/* Same for linebreaks: one flag per code point of the 7-bit range,
   sixteen code points per row */
static unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F
     *   0x000A LINE FEED
     *   0x000B LINE TABULATION
     *   0x000C FORM FEED
     *   0x000D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F
     *   0x001C FILE SEPARATOR
     *   0x001D GROUP SEPARATOR
     *   0x001E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
272
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300273/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
274 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000275Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000276PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000277{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000278#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000279 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000280#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000281 /* This is actually an illegal character, so it should
282 not be passed to unichr. */
283 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000284#endif
285}
286
#ifdef Py_DEBUG
/* Debug-build sanity check of the state of a Unicode object.

   Every expectation is expressed as an assert(), so a violation aborts
   the process; when all of them hold the function returns 1, which lets
   it be used inside assert() itself (see _PyUnicode_CHECK). */
static int
_PyUnicode_CheckConsistency(void *op)
{
    PyASCIIObject *ascii;
    PyCompactUnicodeObject *compact;
    PyUnicodeObject *unicode;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    /* the three layouts share their leading members */
    ascii = (PyASCIIObject *)op;
    compact = (PyCompactUnicodeObject *)op;
    unicode = (PyUnicodeObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.compact == 1) {
        if (ascii->state.ascii == 1) {
            /* compact ASCII */
            assert(kind == PyUnicode_1BYTE_KIND);
            assert(ascii->state.ready == 1);
        }
        else {
            /* compact, not ASCII */
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != (void*)(compact + 1));
        }
    }
    else if (kind == PyUnicode_WCHAR_KIND) {
        /* not compact, only the wstr representation exists */
        assert(ascii->state.compact == 0);
        assert(ascii->state.ascii == 0);
        assert(ascii->state.ready == 0);
        assert(ascii->wstr != NULL);
        assert(unicode->data.any == NULL);
        assert(compact->utf8 == NULL);
        assert(ascii->state.interned == SSTATE_NOT_INTERNED);
    }
    else {
        /* not compact, ready: characters live in a separate buffer */
        assert(kind == PyUnicode_1BYTE_KIND
               || kind == PyUnicode_2BYTE_KIND
               || kind == PyUnicode_4BYTE_KIND);
        assert(ascii->state.compact == 0);
        assert(ascii->state.ready == 1);
        assert(unicode->data.any != NULL);
        if (ascii->state.ascii)
            assert (compact->utf8 == unicode->data.any);
        else
            assert (compact->utf8 != unicode->data.any);
    }
    return 1;
}
#endif
340
Thomas Wouters477c8d52006-05-27 19:21:47 +0000341/* --- Bloom Filters ----------------------------------------------------- */
342
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low-order
   bits of each Unicode character (its value modulo BLOOM_WIDTH) as the
   bit index. */
346
347/* the linebreak mask is set up by Unicode_Init below */
348
Antoine Pitrouf068f942010-01-13 14:19:12 +0000349#if LONG_BIT >= 128
350#define BLOOM_WIDTH 128
351#elif LONG_BIT >= 64
352#define BLOOM_WIDTH 64
353#elif LONG_BIT >= 32
354#define BLOOM_WIDTH 32
355#else
356#error "LONG_BIT is smaller than 32"
357#endif
358
Thomas Wouters477c8d52006-05-27 19:21:47 +0000359#define BLOOM_MASK unsigned long
360
361static BLOOM_MASK bloom_linebreak;
362
Antoine Pitrouf068f942010-01-13 14:19:12 +0000363#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
364#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000365
Benjamin Peterson29060642009-01-31 22:14:21 +0000366#define BLOOM_LINEBREAK(ch) \
367 ((ch) < 128U ? ascii_linebreak[(ch)] : \
368 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000369
Alexander Belopolsky40018472011-02-26 01:02:56 +0000370Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200371make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000372{
373 /* calculate simple bloom-style bitmask for a given unicode string */
374
Antoine Pitrouf068f942010-01-13 14:19:12 +0000375 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000376 Py_ssize_t i;
377
378 mask = 0;
379 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200380 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000381
382 return mask;
383}
384
/* Membership test: the bloom mask is consulted first, and only when its
   bit is set is str searched for chr with PyUnicode_FindChar(). */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr) &&                                                \
     (0 <= PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000388
Guido van Rossumd57fd912000-03-10 22:53:23 +0000389/* --- Unicode Object ----------------------------------------------------- */
390
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200391static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200392fixup(PyUnicodeObject *self, Py_UCS4 (*fixfct)(PyUnicodeObject *s));
393
394Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
395 Py_ssize_t size, Py_UCS4 ch,
396 int direction)
397{
398 /* like wcschr, but doesn't stop at NULL characters */
399 Py_ssize_t i;
400 if (direction == 1) {
401 for(i = 0; i < size; i++)
402 if (PyUnicode_READ(kind, s, i) == ch)
403 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
404 }
405 else {
406 for(i = size-1; i >= 0; i--)
407 if (PyUnicode_READ(kind, s, i) == ch)
408 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
409 }
410 return NULL;
411}
412
Victor Stinnerfe226c02011-10-03 03:52:20 +0200413static PyObject*
414resize_compact(PyObject *unicode, Py_ssize_t length)
415{
416 Py_ssize_t char_size;
417 Py_ssize_t struct_size;
418 Py_ssize_t new_size;
419 int share_wstr;
420
421 assert(PyUnicode_IS_READY(unicode));
422 char_size = PyUnicode_CHARACTER_SIZE(unicode);
423 if (PyUnicode_IS_COMPACT_ASCII(unicode))
424 struct_size = sizeof(PyASCIIObject);
425 else
426 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200427 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200428
429 _Py_DEC_REFTOTAL;
430 _Py_ForgetReference(unicode);
431
432 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
433 PyErr_NoMemory();
434 return NULL;
435 }
436 new_size = (struct_size + (length + 1) * char_size);
437
438 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
439 if (unicode == NULL) {
440 PyObject_Del(unicode);
441 PyErr_NoMemory();
442 return NULL;
443 }
444 _Py_NewReference(unicode);
445 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200446 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200447 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200448 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
449 _PyUnicode_WSTR_LENGTH(unicode) = length;
450 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200451 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
452 length, 0);
453 return unicode;
454}
455
Alexander Belopolsky40018472011-02-26 01:02:56 +0000456static int
Victor Stinnerfe226c02011-10-03 03:52:20 +0200457resize_inplace(register PyUnicodeObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000458{
459 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000460
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200461 assert(!PyUnicode_IS_COMPACT(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200462
Victor Stinnerfe226c02011-10-03 03:52:20 +0200463 assert(Py_REFCNT(unicode) == 1);
Victor Stinnerb15d4d82011-09-28 23:59:20 +0200464 _PyUnicode_DIRTY(unicode);
Tim Petersced69f82003-09-16 20:30:58 +0000465
Victor Stinnerfe226c02011-10-03 03:52:20 +0200466 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
467 {
468 PyObject_DEL(_PyUnicode_UTF8(unicode));
469 _PyUnicode_UTF8(unicode) = NULL;
470 }
471
472 if (PyUnicode_IS_READY(unicode)) {
473 Py_ssize_t char_size;
474 Py_ssize_t new_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200475 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200476 void *data;
477
478 data = _PyUnicode_DATA_ANY(unicode);
479 assert(data != NULL);
480 char_size = PyUnicode_CHARACTER_SIZE(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200481 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
482 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200483
484 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
485 PyErr_NoMemory();
486 return -1;
487 }
488 new_size = (length + 1) * char_size;
489
490 data = (PyObject *)PyObject_REALLOC(data, new_size);
491 if (data == NULL) {
492 PyErr_NoMemory();
493 return -1;
494 }
495 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200496 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200497 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200498 _PyUnicode_WSTR_LENGTH(unicode) = length;
499 }
500 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200501 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200502 _PyUnicode_UTF8_LENGTH(unicode) = length;
503 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200504 _PyUnicode_LENGTH(unicode) = length;
505 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
506 if (share_wstr)
507 return 0;
508 }
509 if (_PyUnicode_WSTR(unicode) != NULL) {
510 assert(_PyUnicode_WSTR(unicode) != NULL);
511
512 oldstr = _PyUnicode_WSTR(unicode);
513 _PyUnicode_WSTR(unicode) = PyObject_REALLOC(_PyUnicode_WSTR(unicode),
514 sizeof(Py_UNICODE) * (length + 1));
515 if (!_PyUnicode_WSTR(unicode)) {
516 _PyUnicode_WSTR(unicode) = (Py_UNICODE *)oldstr;
517 PyErr_NoMemory();
518 return -1;
519 }
520 _PyUnicode_WSTR(unicode)[length] = 0;
521 _PyUnicode_WSTR_LENGTH(unicode) = length;
522 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000523 return 0;
524}
525
Victor Stinnerfe226c02011-10-03 03:52:20 +0200526static PyObject*
527resize_copy(PyObject *unicode, Py_ssize_t length)
528{
529 Py_ssize_t copy_length;
530 if (PyUnicode_IS_COMPACT(unicode)) {
531 PyObject *copy;
532 assert(PyUnicode_IS_READY(unicode));
533
534 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
535 if (copy == NULL)
536 return NULL;
537
538 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
539 if (PyUnicode_CopyCharacters(copy, 0,
540 unicode, 0,
541 copy_length) < 0)
542 {
543 Py_DECREF(copy);
544 return NULL;
545 }
546 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200547 }
548 else {
Victor Stinner2fd82272011-10-03 04:06:05 +0200549 PyUnicodeObject *w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200550 assert(_PyUnicode_WSTR(unicode) != NULL);
551 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinner2fd82272011-10-03 04:06:05 +0200552 w = _PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200553 if (w == NULL)
554 return NULL;
555 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
556 copy_length = Py_MIN(copy_length, length);
557 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
558 copy_length);
559 return (PyObject*)w;
560 }
561}
562
/* We allocate one more character than requested to make sure the
   string is U+0000 terminated; some code (e.g. new_identifier)
   relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000566
567 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000568 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000569
570*/
571
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200572#ifdef Py_DEBUG
573int unicode_old_new_calls = 0;
574#endif
575
Alexander Belopolsky40018472011-02-26 01:02:56 +0000576static PyUnicodeObject *
577_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000578{
579 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200580 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000581
Thomas Wouters477c8d52006-05-27 19:21:47 +0000582 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000583 if (length == 0 && unicode_empty != NULL) {
584 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200585 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000586 }
587
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000588 /* Ensure we won't overflow the size. */
589 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
590 return (PyUnicodeObject *)PyErr_NoMemory();
591 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200592 if (length < 0) {
593 PyErr_SetString(PyExc_SystemError,
594 "Negative size passed to _PyUnicode_New");
595 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000596 }
597
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200598#ifdef Py_DEBUG
599 ++unicode_old_new_calls;
600#endif
601
602 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
603 if (unicode == NULL)
604 return NULL;
605 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
606 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
607 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000608 PyErr_NoMemory();
609 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000610 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200611
Jeremy Hyltond8082792003-09-16 19:41:39 +0000612 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000613 * the caller fails before initializing str -- unicode_resize()
614 * reads str[0], and the Keep-Alive optimization can keep memory
615 * allocated for str alive across a call to unicode_dealloc(unicode).
616 * We don't want unicode_resize to read uninitialized memory in
617 * that case.
618 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200619 _PyUnicode_WSTR(unicode)[0] = 0;
620 _PyUnicode_WSTR(unicode)[length] = 0;
621 _PyUnicode_WSTR_LENGTH(unicode) = length;
622 _PyUnicode_HASH(unicode) = -1;
623 _PyUnicode_STATE(unicode).interned = 0;
624 _PyUnicode_STATE(unicode).kind = 0;
625 _PyUnicode_STATE(unicode).compact = 0;
626 _PyUnicode_STATE(unicode).ready = 0;
627 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200628 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200629 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200630 _PyUnicode_UTF8(unicode) = NULL;
631 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000632 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000633
Benjamin Peterson29060642009-01-31 22:14:21 +0000634 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000635 /* XXX UNREF/NEWREF interface should be more symmetrical */
636 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000637 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000638 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000639 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000640}
641
Victor Stinnerf42dc442011-10-02 23:33:16 +0200642static const char*
643unicode_kind_name(PyObject *unicode)
644{
Victor Stinner42dfd712011-10-03 14:41:45 +0200645 /* don't check consistency: unicode_kind_name() is called from
646 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200647 if (!PyUnicode_IS_COMPACT(unicode))
648 {
649 if (!PyUnicode_IS_READY(unicode))
650 return "wstr";
651 switch(PyUnicode_KIND(unicode))
652 {
653 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200654 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200655 return "legacy ascii";
656 else
657 return "legacy latin1";
658 case PyUnicode_2BYTE_KIND:
659 return "legacy UCS2";
660 case PyUnicode_4BYTE_KIND:
661 return "legacy UCS4";
662 default:
663 return "<legacy invalid kind>";
664 }
665 }
666 assert(PyUnicode_IS_READY(unicode));
667 switch(PyUnicode_KIND(unicode))
668 {
669 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200670 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200671 return "ascii";
672 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200673 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200674 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200675 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200676 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200677 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200678 default:
679 return "<invalid compact kind>";
680 }
681}
682
#ifdef Py_DEBUG
/* Incremented by PyUnicode_New() each time it gets past the empty-string
   shortcut (debug builds only). */
int unicode_new_new_calls = 0;

/* Functions wrapping macros for use in debugger */

/* Wrapper around the PyUnicode_UTF8() macro. */
char *_PyUnicode_utf8(void *unicode){
    return PyUnicode_UTF8(unicode);
}

/* Wrapper around the _PyUnicode_COMPACT_DATA() macro. */
void *_PyUnicode_compact_data(void *unicode) {
    return _PyUnicode_COMPACT_DATA(unicode);
}

/* Print the addresses relevant to locating the character data of
   `unicode` on stdout, then return PyUnicode_DATA(unicode). */
void *_PyUnicode_data(void *unicode){
    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}

/* Print a one-line description of the internal layout of `op` on stdout:
   kind name, length, wstr pointer, and (except for compact ASCII objects,
   which have no such fields) wstr length and utf8 pointer/length, followed
   by the data pointer.  "shared" is printed before a pointer that is equal
   to the data pointer. */
void
_PyUnicode_Dump(PyObject *op)
{
    PyASCIIObject *ascii = (PyASCIIObject *)op;
    PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
    PyUnicodeObject *unicode = (PyUnicodeObject *)op;
    void *data;

    printf("%s: len=%zd, ", unicode_kind_name(op), ascii->length);
    if (ascii->state.compact) {
        /* Compact ASCII objects only carry the PyASCIIObject header, so
           their characters start right after it (see PyUnicode_New());
           other compact objects use the larger PyCompactUnicodeObject
           header. */
        if (ascii->state.ascii)
            data = (ascii + 1);
        else
            data = (compact + 1);
    }
    else
        data = unicode->data.any;
    if (ascii->wstr == data)
        printf("shared ");
    /* %p requires a void* argument. */
    printf("wstr=%p", (void *)ascii->wstr);
    if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
        printf(" (%zd), ", compact->wstr_length);
        if (!ascii->state.compact && compact->utf8 == unicode->data.any)
            printf("shared ");
        printf("utf8=%p (%zd)", (void *)compact->utf8, compact->utf8_length);
    }
    printf(", data=%p\n", data);
}
#endif
728
729PyObject *
730PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
731{
732 PyObject *obj;
733 PyCompactUnicodeObject *unicode;
734 void *data;
735 int kind_state;
736 int is_sharing = 0, is_ascii = 0;
737 Py_ssize_t char_size;
738 Py_ssize_t struct_size;
739
740 /* Optimization for empty strings */
741 if (size == 0 && unicode_empty != NULL) {
742 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200743 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200744 }
745
746#ifdef Py_DEBUG
747 ++unicode_new_new_calls;
748#endif
749
750 struct_size = sizeof(PyCompactUnicodeObject);
751 if (maxchar < 128) {
752 kind_state = PyUnicode_1BYTE_KIND;
753 char_size = 1;
754 is_ascii = 1;
755 struct_size = sizeof(PyASCIIObject);
756 }
757 else if (maxchar < 256) {
758 kind_state = PyUnicode_1BYTE_KIND;
759 char_size = 1;
760 }
761 else if (maxchar < 65536) {
762 kind_state = PyUnicode_2BYTE_KIND;
763 char_size = 2;
764 if (sizeof(wchar_t) == 2)
765 is_sharing = 1;
766 }
767 else {
768 kind_state = PyUnicode_4BYTE_KIND;
769 char_size = 4;
770 if (sizeof(wchar_t) == 4)
771 is_sharing = 1;
772 }
773
774 /* Ensure we won't overflow the size. */
775 if (size < 0) {
776 PyErr_SetString(PyExc_SystemError,
777 "Negative size passed to PyUnicode_New");
778 return NULL;
779 }
780 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
781 return PyErr_NoMemory();
782
783 /* Duplicated allocation code from _PyObject_New() instead of a call to
784 * PyObject_New() so we are able to allocate space for the object and
785 * it's data buffer.
786 */
787 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
788 if (obj == NULL)
789 return PyErr_NoMemory();
790 obj = PyObject_INIT(obj, &PyUnicode_Type);
791 if (obj == NULL)
792 return NULL;
793
794 unicode = (PyCompactUnicodeObject *)obj;
795 if (is_ascii)
796 data = ((PyASCIIObject*)obj) + 1;
797 else
798 data = unicode + 1;
799 _PyUnicode_LENGTH(unicode) = size;
800 _PyUnicode_HASH(unicode) = -1;
801 _PyUnicode_STATE(unicode).interned = 0;
802 _PyUnicode_STATE(unicode).kind = kind_state;
803 _PyUnicode_STATE(unicode).compact = 1;
804 _PyUnicode_STATE(unicode).ready = 1;
805 _PyUnicode_STATE(unicode).ascii = is_ascii;
806 if (is_ascii) {
807 ((char*)data)[size] = 0;
808 _PyUnicode_WSTR(unicode) = NULL;
809 }
810 else if (kind_state == PyUnicode_1BYTE_KIND) {
811 ((char*)data)[size] = 0;
812 _PyUnicode_WSTR(unicode) = NULL;
813 _PyUnicode_WSTR_LENGTH(unicode) = 0;
814 unicode->utf8_length = 0;
815 unicode->utf8 = NULL;
816 }
817 else {
818 unicode->utf8 = NULL;
819 if (kind_state == PyUnicode_2BYTE_KIND)
820 ((Py_UCS2*)data)[size] = 0;
821 else /* kind_state == PyUnicode_4BYTE_KIND */
822 ((Py_UCS4*)data)[size] = 0;
823 if (is_sharing) {
824 _PyUnicode_WSTR_LENGTH(unicode) = size;
825 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
826 }
827 else {
828 _PyUnicode_WSTR_LENGTH(unicode) = 0;
829 _PyUnicode_WSTR(unicode) = NULL;
830 }
831 }
832 return obj;
833}
834
#if SIZEOF_WCHAR_T == 2
/* Convert the 16-bit wchar_t range [begin, end) to UCS4 and store the
   result in the 4-byte data buffer of `unicode`.  A high surrogate that is
   immediately followed by a low surrogate is combined into one code point;
   every other unit, lone surrogates included, is copied unchanged.  The
   other conversions are implemented as macros for efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyUnicodeObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        int is_pair;

        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        /* src[1] is only inspected when it lies inside the input range. */
        is_pair = (src[0] >= 0xD800 && src[0] <= 0xDBFF
                   && (src + 1) < end
                   && src[1] >= 0xDC00 && src[1] <= 0xDFFF);
        if (!is_pair) {
            *dst++ = *src;
            src++;
            continue;
        }
        *dst++ = (((src[0] & 0x3FF) << 10) | (src[1] & 0x3FF)) + 0x10000;
        src += 2;
    }
    /* The output must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
873
Victor Stinnercd9950f2011-10-02 00:34:53 +0200874static int
875_PyUnicode_Dirty(PyObject *unicode)
876{
Victor Stinner910337b2011-10-03 03:20:16 +0200877 assert(_PyUnicode_CHECK(unicode));
Victor Stinnercd9950f2011-10-02 00:34:53 +0200878 if (Py_REFCNT(unicode) != 1) {
879 PyErr_SetString(PyExc_ValueError,
880 "Cannot modify a string having more than 1 reference");
881 return -1;
882 }
883 _PyUnicode_DIRTY(unicode);
884 return 0;
885}
886
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200887Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200888PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
889 PyObject *from, Py_ssize_t from_start,
890 Py_ssize_t how_many)
891{
Victor Stinnera0702ab2011-09-29 14:14:38 +0200892 unsigned int from_kind, to_kind;
893 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200894
Victor Stinnerb1536152011-09-30 02:26:10 +0200895 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
896 PyErr_BadInternalCall();
897 return -1;
898 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200899
900 if (PyUnicode_READY(from))
901 return -1;
902 if (PyUnicode_READY(to))
903 return -1;
904
Victor Stinnerff9e50f2011-09-28 22:17:19 +0200905 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200906 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
907 PyErr_Format(PyExc_ValueError,
908 "Cannot write %zi characters at %zi "
909 "in a string of %zi characters",
910 how_many, to_start, PyUnicode_GET_LENGTH(to));
911 return -1;
912 }
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200913 if (how_many == 0)
914 return 0;
915
Victor Stinnercd9950f2011-10-02 00:34:53 +0200916 if (_PyUnicode_Dirty(to))
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200917 return -1;
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200918
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200919 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200920 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200921 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200922 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200923
Victor Stinnerf42dc442011-10-02 23:33:16 +0200924 if (from_kind == to_kind
925 /* deny latin1 => ascii */
926 && PyUnicode_MAX_CHAR_VALUE(to) >= PyUnicode_MAX_CHAR_VALUE(from))
927 {
Victor Stinnera0702ab2011-09-29 14:14:38 +0200928 Py_MEMCPY((char*)to_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200929 + PyUnicode_KIND_SIZE(to_kind, to_start),
Victor Stinnera0702ab2011-09-29 14:14:38 +0200930 (char*)from_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200931 + PyUnicode_KIND_SIZE(from_kind, from_start),
932 PyUnicode_KIND_SIZE(to_kind, how_many));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200933 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200934 else if (from_kind == PyUnicode_1BYTE_KIND
935 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200936 {
937 _PyUnicode_CONVERT_BYTES(
938 Py_UCS1, Py_UCS2,
939 PyUnicode_1BYTE_DATA(from) + from_start,
940 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
941 PyUnicode_2BYTE_DATA(to) + to_start
942 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200943 }
Victor Stinner157f83f2011-09-28 21:41:31 +0200944 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200945 && to_kind == PyUnicode_4BYTE_KIND)
946 {
947 _PyUnicode_CONVERT_BYTES(
948 Py_UCS1, Py_UCS4,
949 PyUnicode_1BYTE_DATA(from) + from_start,
950 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
951 PyUnicode_4BYTE_DATA(to) + to_start
952 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200953 }
954 else if (from_kind == PyUnicode_2BYTE_KIND
955 && to_kind == PyUnicode_4BYTE_KIND)
956 {
957 _PyUnicode_CONVERT_BYTES(
958 Py_UCS2, Py_UCS4,
959 PyUnicode_2BYTE_DATA(from) + from_start,
960 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
961 PyUnicode_4BYTE_DATA(to) + to_start
962 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200963 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200964 else {
965 int invalid_kinds;
Victor Stinnerf42dc442011-10-02 23:33:16 +0200966
967 /* check if max_char(from substring) <= max_char(to) */
968 if (from_kind > to_kind
969 /* latin1 => ascii */
Victor Stinnera3b334d2011-10-03 13:53:37 +0200970 || (PyUnicode_IS_ASCII(to)
Victor Stinnerf42dc442011-10-02 23:33:16 +0200971 && to_kind == PyUnicode_1BYTE_KIND
Victor Stinnera3b334d2011-10-03 13:53:37 +0200972 && !PyUnicode_IS_ASCII(from)))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200973 {
Victor Stinnera0702ab2011-09-29 14:14:38 +0200974 /* slow path to check for character overflow */
975 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
976 Py_UCS4 ch, maxchar;
977 Py_ssize_t i;
978
979 maxchar = 0;
980 invalid_kinds = 0;
981 for (i=0; i < how_many; i++) {
982 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
983 if (ch > maxchar) {
984 maxchar = ch;
985 if (maxchar > to_maxchar) {
986 invalid_kinds = 1;
987 break;
988 }
989 }
990 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
991 }
992 }
993 else
994 invalid_kinds = 1;
995 if (invalid_kinds) {
996 PyErr_Format(PyExc_ValueError,
Victor Stinnerf42dc442011-10-02 23:33:16 +0200997 "Cannot copy %s characters "
998 "into a string of %s characters",
999 unicode_kind_name(from),
1000 unicode_kind_name(to));
Victor Stinnera0702ab2011-09-29 14:14:38 +02001001 return -1;
1002 }
1003 }
1004 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001005}
1006
Victor Stinner17222162011-09-28 22:15:37 +02001007/* Find the maximum code point and count the number of surrogate pairs so a
1008 correct string length can be computed before converting a string to UCS4.
1009 This function counts single surrogates as a character and not as a pair.
1010
1011 Return 0 on success, or -1 on error. */
1012static int
1013find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1014 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001015{
1016 const wchar_t *iter;
1017
Victor Stinnerc53be962011-10-02 21:33:54 +02001018 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001019 if (num_surrogates == NULL || maxchar == NULL) {
1020 PyErr_SetString(PyExc_SystemError,
1021 "unexpected NULL arguments to "
1022 "PyUnicode_FindMaxCharAndNumSurrogatePairs");
1023 return -1;
1024 }
1025
1026 *num_surrogates = 0;
1027 *maxchar = 0;
1028
1029 for (iter = begin; iter < end; ) {
1030 if (*iter > *maxchar)
1031 *maxchar = *iter;
1032#if SIZEOF_WCHAR_T == 2
1033 if (*iter >= 0xD800 && *iter <= 0xDBFF
1034 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
1035 {
1036 Py_UCS4 surrogate_val;
1037 surrogate_val = (((iter[0] & 0x3FF)<<10)
1038 | (iter[1] & 0x3FF)) + 0x10000;
1039 ++(*num_surrogates);
1040 if (surrogate_val > *maxchar)
1041 *maxchar = surrogate_val;
1042 iter += 2;
1043 }
1044 else
1045 iter++;
1046#else
1047 iter++;
1048#endif
1049 }
1050 return 0;
1051}
1052
1053#ifdef Py_DEBUG
1054int unicode_ready_calls = 0;
1055#endif
1056
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001057static int
1058unicode_ready(PyObject **p_obj, int replace)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001059{
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001060 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001061 wchar_t *end;
1062 Py_UCS4 maxchar = 0;
1063 Py_ssize_t num_surrogates;
1064#if SIZEOF_WCHAR_T == 2
1065 Py_ssize_t length_wo_surrogates;
1066#endif
1067
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001068 assert(p_obj != NULL);
1069 unicode = (PyUnicodeObject *)*p_obj;
1070
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001071 /* _PyUnicode_Ready() is only intented for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001072 strings were created using _PyObject_New() and where no canonical
1073 representation (the str field) has been set yet aka strings
1074 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001075 assert(_PyUnicode_CHECK(unicode));
1076 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001077 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001078 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001079 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001080 /* Actually, it should neither be interned nor be anything else: */
1081 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001082
1083#ifdef Py_DEBUG
1084 ++unicode_ready_calls;
1085#endif
1086
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001087#ifdef Py_DEBUG
1088 assert(!replace || Py_REFCNT(unicode) == 1);
1089#else
1090 if (replace && Py_REFCNT(unicode) != 1)
1091 replace = 0;
1092#endif
1093 if (replace) {
1094 Py_ssize_t len = _PyUnicode_WSTR_LENGTH(unicode);
1095 wchar_t *wstr = _PyUnicode_WSTR(unicode);
1096 /* Optimization for empty strings */
1097 if (len == 0) {
1098 Py_INCREF(unicode_empty);
1099 Py_DECREF(*p_obj);
1100 *p_obj = unicode_empty;
1101 return 0;
1102 }
1103 if (len == 1 && wstr[0] < 256) {
1104 PyObject *latin1_char = get_latin1_char((unsigned char)wstr[0]);
1105 if (latin1_char == NULL)
1106 return -1;
1107 Py_DECREF(*p_obj);
1108 *p_obj = latin1_char;
1109 return 0;
1110 }
1111 }
1112
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001113 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001114 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001115 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001116 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001117
1118 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001119 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1120 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001121 PyErr_NoMemory();
1122 return -1;
1123 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001124 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001125 _PyUnicode_WSTR(unicode), end,
1126 PyUnicode_1BYTE_DATA(unicode));
1127 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1128 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1129 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1130 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001131 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001132 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001133 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001134 }
1135 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001136 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001137 _PyUnicode_UTF8(unicode) = NULL;
1138 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001139 }
1140 PyObject_FREE(_PyUnicode_WSTR(unicode));
1141 _PyUnicode_WSTR(unicode) = NULL;
1142 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1143 }
1144 /* In this case we might have to convert down from 4-byte native
1145 wchar_t to 2-byte unicode. */
1146 else if (maxchar < 65536) {
1147 assert(num_surrogates == 0 &&
1148 "FindMaxCharAndNumSurrogatePairs() messed up");
1149
Victor Stinner506f5922011-09-28 22:34:18 +02001150#if SIZEOF_WCHAR_T == 2
1151 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001152 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001153 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1154 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1155 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001156 _PyUnicode_UTF8(unicode) = NULL;
1157 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001158#else
1159 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001160 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001161 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001162 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001163 PyErr_NoMemory();
1164 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001165 }
Victor Stinner506f5922011-09-28 22:34:18 +02001166 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1167 _PyUnicode_WSTR(unicode), end,
1168 PyUnicode_2BYTE_DATA(unicode));
1169 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1170 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1171 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001172 _PyUnicode_UTF8(unicode) = NULL;
1173 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001174 PyObject_FREE(_PyUnicode_WSTR(unicode));
1175 _PyUnicode_WSTR(unicode) = NULL;
1176 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1177#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001178 }
1179 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1180 else {
1181#if SIZEOF_WCHAR_T == 2
1182 /* in case the native representation is 2-bytes, we need to allocate a
1183 new normalized 4-byte version. */
1184 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001185 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1186 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001187 PyErr_NoMemory();
1188 return -1;
1189 }
1190 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1191 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001192 _PyUnicode_UTF8(unicode) = NULL;
1193 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001194 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1195 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001196 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001197 PyObject_FREE(_PyUnicode_WSTR(unicode));
1198 _PyUnicode_WSTR(unicode) = NULL;
1199 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1200#else
1201 assert(num_surrogates == 0);
1202
Victor Stinnerc3c74152011-10-02 20:39:55 +02001203 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001204 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001205 _PyUnicode_UTF8(unicode) = NULL;
1206 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001207 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1208#endif
1209 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1210 }
1211 _PyUnicode_STATE(unicode).ready = 1;
1212 return 0;
1213}
1214
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001215int
1216_PyUnicode_ReadyReplace(PyObject **op)
1217{
1218 return unicode_ready(op, 1);
1219}
1220
1221int
1222_PyUnicode_Ready(PyObject *op)
1223{
1224 return unicode_ready(&op, 0);
1225}
1226
Alexander Belopolsky40018472011-02-26 01:02:56 +00001227static void
1228unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001229{
Walter Dörwald16807132007-05-25 13:52:07 +00001230 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001231 case SSTATE_NOT_INTERNED:
1232 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001233
Benjamin Peterson29060642009-01-31 22:14:21 +00001234 case SSTATE_INTERNED_MORTAL:
1235 /* revive dead object temporarily for DelItem */
1236 Py_REFCNT(unicode) = 3;
1237 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
1238 Py_FatalError(
1239 "deletion of interned string failed");
1240 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001241
Benjamin Peterson29060642009-01-31 22:14:21 +00001242 case SSTATE_INTERNED_IMMORTAL:
1243 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001244
Benjamin Peterson29060642009-01-31 22:14:21 +00001245 default:
1246 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001247 }
1248
Victor Stinner03490912011-10-03 23:45:12 +02001249 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001250 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001251 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001252 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001253
1254 if (PyUnicode_IS_COMPACT(unicode)) {
1255 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001256 }
1257 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001258 if (_PyUnicode_DATA_ANY(unicode))
1259 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Benjamin Peterson29060642009-01-31 22:14:21 +00001260 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001261 }
1262}
1263
Alexander Belopolsky40018472011-02-26 01:02:56 +00001264static int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001265unicode_resizable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001266{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001267 if (Py_REFCNT(unicode) != 1)
1268 return 0;
1269 if (PyUnicode_CHECK_INTERNED(unicode))
1270 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001271 assert (unicode != unicode_empty);
1272#ifdef Py_DEBUG
1273 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND
1274 && PyUnicode_GET_LENGTH(unicode) == 1)
1275 {
1276 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001277 if (ch < 256 && unicode_latin1[ch] == unicode)
1278 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001279 }
Victor Stinner77bb47b2011-10-03 20:06:05 +02001280#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001281 return 1;
1282}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001283
Victor Stinnerfe226c02011-10-03 03:52:20 +02001284static int
1285unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1286{
1287 PyObject *unicode;
1288 Py_ssize_t old_length;
1289
1290 assert(p_unicode != NULL);
1291 unicode = *p_unicode;
1292
1293 assert(unicode != NULL);
1294 assert(PyUnicode_Check(unicode));
1295 assert(0 <= length);
1296
Victor Stinner910337b2011-10-03 03:20:16 +02001297 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001298 old_length = PyUnicode_WSTR_LENGTH(unicode);
1299 else
1300 old_length = PyUnicode_GET_LENGTH(unicode);
1301 if (old_length == length)
1302 return 0;
1303
Victor Stinnerfe226c02011-10-03 03:52:20 +02001304 if (!unicode_resizable(unicode)) {
1305 PyObject *copy = resize_copy(unicode, length);
1306 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001307 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001308 Py_DECREF(*p_unicode);
1309 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001310 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001311 }
1312
Victor Stinnerfe226c02011-10-03 03:52:20 +02001313 if (PyUnicode_IS_COMPACT(unicode)) {
1314 *p_unicode = resize_compact(unicode, length);
1315 if (*p_unicode == NULL)
1316 return -1;
1317 return 0;
1318 } else
1319 return resize_inplace((PyUnicodeObject*)unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001320}
1321
Alexander Belopolsky40018472011-02-26 01:02:56 +00001322int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001323PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001324{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001325 PyObject *unicode;
1326 if (p_unicode == NULL) {
1327 PyErr_BadInternalCall();
1328 return -1;
1329 }
1330 unicode = *p_unicode;
1331 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0
1332 || _PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND)
1333 {
1334 PyErr_BadInternalCall();
1335 return -1;
1336 }
1337 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001338}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001339
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001340static PyObject*
1341get_latin1_char(unsigned char ch)
1342{
Victor Stinnera464fc12011-10-02 20:39:30 +02001343 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001344 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001345 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001346 if (!unicode)
1347 return NULL;
1348 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1349 unicode_latin1[ch] = unicode;
1350 }
1351 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001352 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001353}
1354
Alexander Belopolsky40018472011-02-26 01:02:56 +00001355PyObject *
1356PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001357{
1358 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001359 Py_UCS4 maxchar = 0;
1360 Py_ssize_t num_surrogates;
1361
1362 if (u == NULL)
1363 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001364
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001365 /* If the Unicode data is known at construction time, we can apply
1366 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001367
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001368 /* Optimization for empty strings */
1369 if (size == 0 && unicode_empty != NULL) {
1370 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001371 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001372 }
Tim Petersced69f82003-09-16 20:30:58 +00001373
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001374 /* Single character Unicode objects in the Latin-1 range are
1375 shared when using this constructor */
1376 if (size == 1 && *u < 256)
1377 return get_latin1_char((unsigned char)*u);
1378
1379 /* If not empty and not single character, copy the Unicode data
1380 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001381 if (find_maxchar_surrogates(u, u + size,
1382 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001383 return NULL;
1384
1385 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1386 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001387 if (!unicode)
1388 return NULL;
1389
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001390 switch (PyUnicode_KIND(unicode)) {
1391 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001392 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001393 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1394 break;
1395 case PyUnicode_2BYTE_KIND:
1396#if Py_UNICODE_SIZE == 2
1397 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1398#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001399 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001400 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1401#endif
1402 break;
1403 case PyUnicode_4BYTE_KIND:
1404#if SIZEOF_WCHAR_T == 2
1405 /* This is the only case which has to process surrogates, thus
1406 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001407 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001408#else
1409 assert(num_surrogates == 0);
1410 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1411#endif
1412 break;
1413 default:
1414 assert(0 && "Impossible state");
1415 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001416
1417 return (PyObject *)unicode;
1418}
1419
Alexander Belopolsky40018472011-02-26 01:02:56 +00001420PyObject *
1421PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001422{
1423 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001424
Benjamin Peterson14339b62009-01-31 16:36:08 +00001425 if (size < 0) {
1426 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001427 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001428 return NULL;
1429 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001430
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001431 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001432 some optimizations which share commonly used objects.
1433 Also, this means the input must be UTF-8, so fall back to the
1434 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001435 if (u != NULL) {
1436
Benjamin Peterson29060642009-01-31 22:14:21 +00001437 /* Optimization for empty strings */
1438 if (size == 0 && unicode_empty != NULL) {
1439 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001440 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001441 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001442
1443 /* Single characters are shared when using this constructor.
1444 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001445 if (size == 1 && Py_CHARMASK(*u) < 128)
1446 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001447
1448 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001449 }
1450
Walter Dörwald55507312007-05-18 13:12:10 +00001451 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001452 if (!unicode)
1453 return NULL;
1454
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001455 return (PyObject *)unicode;
1456}
1457
Alexander Belopolsky40018472011-02-26 01:02:56 +00001458PyObject *
1459PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001460{
1461 size_t size = strlen(u);
1462 if (size > PY_SSIZE_T_MAX) {
1463 PyErr_SetString(PyExc_OverflowError, "input too long");
1464 return NULL;
1465 }
1466
1467 return PyUnicode_FromStringAndSize(u, size);
1468}
1469
Victor Stinnere57b1c02011-09-28 22:20:48 +02001470static PyObject*
1471_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001472{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001473 PyObject *res;
1474 unsigned char max = 127;
1475 Py_ssize_t i;
1476 for (i = 0; i < size; i++) {
1477 if (u[i] & 0x80) {
1478 max = 255;
1479 break;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001480 }
1481 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001482 res = PyUnicode_New(size, max);
1483 if (!res)
1484 return NULL;
1485 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1486 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001487}
1488
Victor Stinnere57b1c02011-09-28 22:20:48 +02001489static PyObject*
1490_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001491{
1492 PyObject *res;
1493 Py_UCS2 max = 0;
1494 Py_ssize_t i;
1495 for (i = 0; i < size; i++)
1496 if (u[i] > max)
1497 max = u[i];
1498 res = PyUnicode_New(size, max);
1499 if (!res)
1500 return NULL;
1501 if (max >= 256)
1502 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1503 else
1504 for (i = 0; i < size; i++)
1505 PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i];
1506 return res;
1507}
1508
Victor Stinnere57b1c02011-09-28 22:20:48 +02001509static PyObject*
1510_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001511{
1512 PyObject *res;
1513 Py_UCS4 max = 0;
1514 Py_ssize_t i;
1515 for (i = 0; i < size; i++)
1516 if (u[i] > max)
1517 max = u[i];
1518 res = PyUnicode_New(size, max);
1519 if (!res)
1520 return NULL;
1521 if (max >= 0x10000)
1522 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1523 else {
1524 int kind = PyUnicode_KIND(res);
1525 void *data = PyUnicode_DATA(res);
1526 for (i = 0; i < size; i++)
1527 PyUnicode_WRITE(kind, data, i, u[i]);
1528 }
1529 return res;
1530}
1531
1532PyObject*
1533PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1534{
1535 switch(kind) {
1536 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001537 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001538 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001539 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001540 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001541 return _PyUnicode_FromUCS4(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001542 }
Victor Stinner202b62b2011-10-01 23:48:37 +02001543 PyErr_SetString(PyExc_ValueError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001544 return NULL;
1545}
1546
Victor Stinner034f6cf2011-09-30 02:26:44 +02001547PyObject*
1548PyUnicode_Copy(PyObject *unicode)
1549{
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001550 Py_ssize_t size;
1551 PyObject *copy;
1552 void *data;
1553
Victor Stinner034f6cf2011-09-30 02:26:44 +02001554 if (!PyUnicode_Check(unicode)) {
1555 PyErr_BadInternalCall();
1556 return NULL;
1557 }
1558 if (PyUnicode_READY(unicode))
1559 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001560
1561 size = PyUnicode_GET_LENGTH(unicode);
1562 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1563 if (!copy)
1564 return NULL;
1565 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1566
1567 data = PyUnicode_DATA(unicode);
1568 switch (PyUnicode_KIND(unicode))
1569 {
1570 case PyUnicode_1BYTE_KIND:
1571 memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1572 break;
1573 case PyUnicode_2BYTE_KIND:
1574 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1575 break;
1576 case PyUnicode_4BYTE_KIND:
1577 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1578 break;
1579 default:
1580 assert(0);
1581 break;
1582 }
1583 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001584}
1585
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001586
Victor Stinnerbc603d12011-10-02 01:00:40 +02001587/* Widen Unicode objects to larger buffers. Don't write terminating null
1588 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001589
1590void*
1591_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1592{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001593 Py_ssize_t len;
1594 void *result;
1595 unsigned int skind;
1596
1597 if (PyUnicode_READY(s))
1598 return NULL;
1599
1600 len = PyUnicode_GET_LENGTH(s);
1601 skind = PyUnicode_KIND(s);
1602 if (skind >= kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001603 PyErr_SetString(PyExc_RuntimeError, "invalid widening attempt");
1604 return NULL;
1605 }
1606 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001607 case PyUnicode_2BYTE_KIND:
1608 result = PyMem_Malloc(len * sizeof(Py_UCS2));
1609 if (!result)
1610 return PyErr_NoMemory();
1611 assert(skind == PyUnicode_1BYTE_KIND);
1612 _PyUnicode_CONVERT_BYTES(
1613 Py_UCS1, Py_UCS2,
1614 PyUnicode_1BYTE_DATA(s),
1615 PyUnicode_1BYTE_DATA(s) + len,
1616 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001617 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001618 case PyUnicode_4BYTE_KIND:
1619 result = PyMem_Malloc(len * sizeof(Py_UCS4));
1620 if (!result)
1621 return PyErr_NoMemory();
1622 if (skind == PyUnicode_2BYTE_KIND) {
1623 _PyUnicode_CONVERT_BYTES(
1624 Py_UCS2, Py_UCS4,
1625 PyUnicode_2BYTE_DATA(s),
1626 PyUnicode_2BYTE_DATA(s) + len,
1627 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001628 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001629 else {
1630 assert(skind == PyUnicode_1BYTE_KIND);
1631 _PyUnicode_CONVERT_BYTES(
1632 Py_UCS1, Py_UCS4,
1633 PyUnicode_1BYTE_DATA(s),
1634 PyUnicode_1BYTE_DATA(s) + len,
1635 result);
1636 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001637 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001638 default:
1639 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001640 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001641 PyErr_SetString(PyExc_ValueError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001642 return NULL;
1643}
1644
1645static Py_UCS4*
1646as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1647 int copy_null)
1648{
1649 int kind;
1650 void *data;
1651 Py_ssize_t len, targetlen;
1652 if (PyUnicode_READY(string) == -1)
1653 return NULL;
1654 kind = PyUnicode_KIND(string);
1655 data = PyUnicode_DATA(string);
1656 len = PyUnicode_GET_LENGTH(string);
1657 targetlen = len;
1658 if (copy_null)
1659 targetlen++;
1660 if (!target) {
1661 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1662 PyErr_NoMemory();
1663 return NULL;
1664 }
1665 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1666 if (!target) {
1667 PyErr_NoMemory();
1668 return NULL;
1669 }
1670 }
1671 else {
1672 if (targetsize < targetlen) {
1673 PyErr_Format(PyExc_SystemError,
1674 "string is longer than the buffer");
1675 if (copy_null && 0 < targetsize)
1676 target[0] = 0;
1677 return NULL;
1678 }
1679 }
1680 if (kind != PyUnicode_4BYTE_KIND) {
1681 Py_ssize_t i;
1682 for (i = 0; i < len; i++)
1683 target[i] = PyUnicode_READ(kind, data, i);
1684 }
1685 else
1686 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1687 if (copy_null)
1688 target[len] = 0;
1689 return target;
1690}
1691
1692Py_UCS4*
1693PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1694 int copy_null)
1695{
1696 if (target == NULL || targetsize < 1) {
1697 PyErr_BadInternalCall();
1698 return NULL;
1699 }
1700 return as_ucs4(string, target, targetsize, copy_null);
1701}
1702
1703Py_UCS4*
1704PyUnicode_AsUCS4Copy(PyObject *string)
1705{
1706 return as_ucs4(string, NULL, 0, 1);
1707}
1708
#ifdef HAVE_WCHAR_H

/* Create a Unicode object from the wchar_t buffer `w` of `size` characters.
   A `size` of -1 means "measure the string with wcslen()".  A NULL `w` is
   accepted only together with size 0, which returns PyUnicode_New(0, 0);
   any other size with a NULL `w` raises a bad-internal-call error and
   returns NULL. */
PyObject *
PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
{
    if (w != NULL) {
        Py_ssize_t length = size;

        if (length == -1)
            length = wcslen(w);
        return PyUnicode_FromUnicode(w, length);
    }

    if (size != 0) {
        PyErr_BadInternalCall();
        return NULL;
    }
    return PyUnicode_New(0, 0);
}

#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00001729
Walter Dörwald346737f2007-05-31 10:44:43 +00001730static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001731makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
1732 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00001733{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001734 *fmt++ = '%';
1735 if (width) {
1736 if (zeropad)
1737 *fmt++ = '0';
1738 fmt += sprintf(fmt, "%d", width);
1739 }
1740 if (precision)
1741 fmt += sprintf(fmt, ".%d", precision);
1742 if (longflag)
1743 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001744 else if (longlongflag) {
1745 /* longlongflag should only ever be nonzero on machines with
1746 HAVE_LONG_LONG defined */
1747#ifdef HAVE_LONG_LONG
1748 char *f = PY_FORMAT_LONG_LONG;
1749 while (*f)
1750 *fmt++ = *f++;
1751#else
1752 /* we shouldn't ever get here */
1753 assert(0);
1754 *fmt++ = 'l';
1755#endif
1756 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001757 else if (size_tflag) {
1758 char *f = PY_FORMAT_SIZE_T;
1759 while (*f)
1760 *fmt++ = *f++;
1761 }
1762 *fmt++ = c;
1763 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00001764}
1765
/* helper for parse_format_flags(): true for the conversion characters that
   may follow an 'l', 'll' or 'z' length modifier. */
static int
is_dui_conversion(char ch)
{
    return ch == 'd' || ch == 'u' || ch == 'i';
}

/* helper for PyUnicode_FromFormatV()

   Parse the part of a format specification that follows the '%' at `f`:
   an optional decimal width, an optional ".precision", and an optional
   length modifier ("l", "ll" when HAVE_LONG_LONG is defined, or "z") that
   is only recognized when followed by 'd', 'u' or 'i'.

   Each p_* argument may be NULL; the non-NULL ones receive the parsed
   value (0 when the item is absent).  The returned pointer designates the
   conversion character, with two special cases kept from the parsing
   rules below: it is moved back by one when a precision is directly
   followed by '%', and when the end of the string is reached. */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    f++;
    while (Py_ISDIGIT((unsigned)*f)) {
        width = (width*10) + *f - '0';
        f++;
    }
    if (*f == '.') {
        f++;
        while (Py_ISDIGIT((unsigned)*f)) {
            precision = (precision*10) + *f - '0';
            f++;
        }
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }

    /* Handle %ld, %lu, %lld and %llu, then the size_t flag. */
    if (*f == 'l') {
        if (is_dui_conversion(f[1])) {
            longflag = 1;
            ++f;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' && is_dui_conversion(f[2])) {
            longlongflag = 1;
            f += 2;
        }
#endif
    }
    else if (*f == 'z' && is_dui_conversion(f[1])) {
        size_tflag = 1;
        ++f;
    }

    /* Report only what the caller asked for. */
    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;
    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
1830
/* Upper bound on the number of characters produced by %ld: 21 characters
   are enough for a 64-bit integer in decimal with an optional sign. */
#define MAX_LONG_CHARS 21
/* Upper bound on the number of characters produced by %lld.
   At most ceil(log10(256)*SIZEOF_LONG_LONG) digits are needed, and 53/22 is
   an upper bound for log10(256).  The ceiling is computed as
   1 + (53*SIZEOF_LONG_LONG - 1) / 22, and 1 more is added for the sign. */
#define MAX_LONG_LONG_CHARS (2 + (53 * SIZEOF_LONG_LONG - 1) / 22)
1838
Walter Dörwaldd2034312007-05-18 16:29:38 +00001839PyObject *
1840PyUnicode_FromFormatV(const char *format, va_list vargs)
1841{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001842 va_list count;
1843 Py_ssize_t callcount = 0;
1844 PyObject **callresults = NULL;
1845 PyObject **callresult = NULL;
1846 Py_ssize_t n = 0;
1847 int width = 0;
1848 int precision = 0;
1849 int zeropad;
1850 const char* f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001851 PyUnicodeObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001852 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001853 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001854 Py_UCS4 maxchar = 127; /* result is ASCII by default */
1855 Py_UCS4 argmaxchar;
1856 Py_ssize_t numbersize = 0;
1857 char *numberresults = NULL;
1858 char *numberresult = NULL;
1859 Py_ssize_t i;
1860 int kind;
1861 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001862
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001863 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001864 /* step 1: count the number of %S/%R/%A/%s format specifications
1865 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
1866 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001867 * result in an array)
1868 * also esimate a upper bound for all the number formats in the string,
1869 * numbers will be formated in step 3 and be keept in a '\0'-separated
1870 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00001871 for (f = format; *f; f++) {
1872 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001873 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001874 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
1875 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
1876 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
1877 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001878
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001879 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001880#ifdef HAVE_LONG_LONG
1881 if (longlongflag) {
1882 if (width < MAX_LONG_LONG_CHARS)
1883 width = MAX_LONG_LONG_CHARS;
1884 }
1885 else
1886#endif
1887 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
1888 including sign. Decimal takes the most space. This
1889 isn't enough for octal. If a width is specified we
1890 need more (which we allocate later). */
1891 if (width < MAX_LONG_CHARS)
1892 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001893
1894 /* account for the size + '\0' to separate numbers
1895 inside of the numberresults buffer */
1896 numbersize += (width + 1);
1897 }
1898 }
1899 else if ((unsigned char)*f > 127) {
1900 PyErr_Format(PyExc_ValueError,
1901 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
1902 "string, got a non-ASCII byte: 0x%02x",
1903 (unsigned char)*f);
1904 return NULL;
1905 }
1906 }
1907 /* step 2: allocate memory for the results of
1908 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
1909 if (callcount) {
1910 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
1911 if (!callresults) {
1912 PyErr_NoMemory();
1913 return NULL;
1914 }
1915 callresult = callresults;
1916 }
1917 /* step 2.5: allocate memory for the results of formating numbers */
1918 if (numbersize) {
1919 numberresults = PyObject_Malloc(numbersize);
1920 if (!numberresults) {
1921 PyErr_NoMemory();
1922 goto fail;
1923 }
1924 numberresult = numberresults;
1925 }
1926
1927 /* step 3: format numbers and figure out how large a buffer we need */
1928 for (f = format; *f; f++) {
1929 if (*f == '%') {
1930 const char* p;
1931 int longflag;
1932 int longlongflag;
1933 int size_tflag;
1934 int numprinted;
1935
1936 p = f;
1937 zeropad = (f[1] == '0');
1938 f = parse_format_flags(f, &width, &precision,
1939 &longflag, &longlongflag, &size_tflag);
1940 switch (*f) {
1941 case 'c':
1942 {
1943 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001944 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001945 n++;
1946 break;
1947 }
1948 case '%':
1949 n++;
1950 break;
1951 case 'i':
1952 case 'd':
1953 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1954 width, precision, *f);
1955 if (longflag)
1956 numprinted = sprintf(numberresult, fmt,
1957 va_arg(count, long));
1958#ifdef HAVE_LONG_LONG
1959 else if (longlongflag)
1960 numprinted = sprintf(numberresult, fmt,
1961 va_arg(count, PY_LONG_LONG));
1962#endif
1963 else if (size_tflag)
1964 numprinted = sprintf(numberresult, fmt,
1965 va_arg(count, Py_ssize_t));
1966 else
1967 numprinted = sprintf(numberresult, fmt,
1968 va_arg(count, int));
1969 n += numprinted;
1970 /* advance by +1 to skip over the '\0' */
1971 numberresult += (numprinted + 1);
1972 assert(*(numberresult - 1) == '\0');
1973 assert(*(numberresult - 2) != '\0');
1974 assert(numprinted >= 0);
1975 assert(numberresult <= numberresults + numbersize);
1976 break;
1977 case 'u':
1978 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1979 width, precision, 'u');
1980 if (longflag)
1981 numprinted = sprintf(numberresult, fmt,
1982 va_arg(count, unsigned long));
1983#ifdef HAVE_LONG_LONG
1984 else if (longlongflag)
1985 numprinted = sprintf(numberresult, fmt,
1986 va_arg(count, unsigned PY_LONG_LONG));
1987#endif
1988 else if (size_tflag)
1989 numprinted = sprintf(numberresult, fmt,
1990 va_arg(count, size_t));
1991 else
1992 numprinted = sprintf(numberresult, fmt,
1993 va_arg(count, unsigned int));
1994 n += numprinted;
1995 numberresult += (numprinted + 1);
1996 assert(*(numberresult - 1) == '\0');
1997 assert(*(numberresult - 2) != '\0');
1998 assert(numprinted >= 0);
1999 assert(numberresult <= numberresults + numbersize);
2000 break;
2001 case 'x':
2002 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2003 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2004 n += numprinted;
2005 numberresult += (numprinted + 1);
2006 assert(*(numberresult - 1) == '\0');
2007 assert(*(numberresult - 2) != '\0');
2008 assert(numprinted >= 0);
2009 assert(numberresult <= numberresults + numbersize);
2010 break;
2011 case 'p':
2012 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2013 /* %p is ill-defined: ensure leading 0x. */
2014 if (numberresult[1] == 'X')
2015 numberresult[1] = 'x';
2016 else if (numberresult[1] != 'x') {
2017 memmove(numberresult + 2, numberresult,
2018 strlen(numberresult) + 1);
2019 numberresult[0] = '0';
2020 numberresult[1] = 'x';
2021 numprinted += 2;
2022 }
2023 n += numprinted;
2024 numberresult += (numprinted + 1);
2025 assert(*(numberresult - 1) == '\0');
2026 assert(*(numberresult - 2) != '\0');
2027 assert(numprinted >= 0);
2028 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002029 break;
2030 case 's':
2031 {
2032 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002033 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002034 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
2035 if (!str)
2036 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002037 /* since PyUnicode_DecodeUTF8 returns already flexible
2038 unicode objects, there is no need to call ready on them */
2039 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002040 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002041 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002042 /* Remember the str and switch to the next slot */
2043 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002044 break;
2045 }
2046 case 'U':
2047 {
2048 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002049 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002050 if (PyUnicode_READY(obj) == -1)
2051 goto fail;
2052 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002053 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002054 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002055 break;
2056 }
2057 case 'V':
2058 {
2059 PyObject *obj = va_arg(count, PyObject *);
2060 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002061 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002062 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002063 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002064 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002065 if (PyUnicode_READY(obj) == -1)
2066 goto fail;
2067 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002068 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002069 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002070 *callresult++ = NULL;
2071 }
2072 else {
2073 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2074 if (!str_obj)
2075 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002076 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002077 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002078 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002079 *callresult++ = str_obj;
2080 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002081 break;
2082 }
2083 case 'S':
2084 {
2085 PyObject *obj = va_arg(count, PyObject *);
2086 PyObject *str;
2087 assert(obj);
2088 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002089 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002090 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002091 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002092 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002093 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002094 /* Remember the str and switch to the next slot */
2095 *callresult++ = str;
2096 break;
2097 }
2098 case 'R':
2099 {
2100 PyObject *obj = va_arg(count, PyObject *);
2101 PyObject *repr;
2102 assert(obj);
2103 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002104 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002105 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002106 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002107 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002108 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002109 /* Remember the repr and switch to the next slot */
2110 *callresult++ = repr;
2111 break;
2112 }
2113 case 'A':
2114 {
2115 PyObject *obj = va_arg(count, PyObject *);
2116 PyObject *ascii;
2117 assert(obj);
2118 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002119 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002120 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002121 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002122 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002123 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002124 /* Remember the repr and switch to the next slot */
2125 *callresult++ = ascii;
2126 break;
2127 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002128 default:
2129 /* if we stumble upon an unknown
2130 formatting code, copy the rest of
2131 the format string to the output
2132 string. (we cannot just skip the
2133 code, since there's no way to know
2134 what's in the argument list) */
2135 n += strlen(p);
2136 goto expand;
2137 }
2138 } else
2139 n++;
2140 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002141 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002142 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002143 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002144 we don't have to resize the string.
2145 There can be no errors beyond this point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002146 string = (PyUnicodeObject *)PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002147 if (!string)
2148 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002149 kind = PyUnicode_KIND(string);
2150 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002151 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002152 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002153
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002154 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002155 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002156 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002157
2158 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002159 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2160 /* checking for == because the last argument could be a empty
2161 string, which causes i to point to end, the assert at the end of
2162 the loop */
2163 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002164
Benjamin Peterson14339b62009-01-31 16:36:08 +00002165 switch (*f) {
2166 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002167 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002168 const int ordinal = va_arg(vargs, int);
2169 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002170 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002171 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002172 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002173 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002174 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002175 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002176 case 'p':
2177 /* unused, since we already have the result */
2178 if (*f == 'p')
2179 (void) va_arg(vargs, void *);
2180 else
2181 (void) va_arg(vargs, int);
2182 /* extract the result from numberresults and append. */
2183 for (; *numberresult; ++i, ++numberresult)
2184 PyUnicode_WRITE(kind, data, i, *numberresult);
2185 /* skip over the separating '\0' */
2186 assert(*numberresult == '\0');
2187 numberresult++;
2188 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002189 break;
2190 case 's':
2191 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002192 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002193 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002194 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002195 size = PyUnicode_GET_LENGTH(*callresult);
2196 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002197 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2198 *callresult, 0,
2199 size) < 0)
2200 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002201 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002202 /* We're done with the unicode()/repr() => forget it */
2203 Py_DECREF(*callresult);
2204 /* switch to next unicode()/repr() result */
2205 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002206 break;
2207 }
2208 case 'U':
2209 {
2210 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002211 Py_ssize_t size;
2212 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2213 size = PyUnicode_GET_LENGTH(obj);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002214 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2215 obj, 0,
2216 size) < 0)
2217 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002218 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002219 break;
2220 }
2221 case 'V':
2222 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002223 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002224 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002225 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002226 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002227 size = PyUnicode_GET_LENGTH(obj);
2228 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002229 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2230 obj, 0,
2231 size) < 0)
2232 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002233 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002234 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002235 size = PyUnicode_GET_LENGTH(*callresult);
2236 assert(PyUnicode_KIND(*callresult) <=
2237 PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002238 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2239 *callresult,
2240 0, size) < 0)
2241 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002242 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002243 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002244 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002245 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002246 break;
2247 }
2248 case 'S':
2249 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002250 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002251 {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002252 /* unused, since we already have the result */
2253 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002254 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002255 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2256 *callresult, 0,
2257 PyUnicode_GET_LENGTH(*callresult)) < 0)
2258 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002259 i += PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002260 /* We're done with the unicode()/repr() => forget it */
2261 Py_DECREF(*callresult);
2262 /* switch to next unicode()/repr() result */
2263 ++callresult;
2264 break;
2265 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002266 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002267 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002268 break;
2269 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002270 for (; *p; ++p, ++i)
2271 PyUnicode_WRITE(kind, data, i, *p);
2272 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002273 goto end;
2274 }
Victor Stinner1205f272010-09-11 00:54:47 +00002275 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002276 else {
2277 assert(i < PyUnicode_GET_LENGTH(string));
2278 PyUnicode_WRITE(kind, data, i++, *f);
2279 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002280 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002281 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002282
Benjamin Peterson29060642009-01-31 22:14:21 +00002283 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002284 if (callresults)
2285 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002286 if (numberresults)
2287 PyObject_Free(numberresults);
2288 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00002289 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002290 if (callresults) {
2291 PyObject **callresult2 = callresults;
2292 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002293 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002294 ++callresult2;
2295 }
2296 PyObject_Free(callresults);
2297 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002298 if (numberresults)
2299 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002300 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002301}
2302
Walter Dörwaldd2034312007-05-18 16:29:38 +00002303PyObject *
2304PyUnicode_FromFormat(const char *format, ...)
2305{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002306 PyObject* ret;
2307 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002308
2309#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002310 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002311#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002312 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002313#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002314 ret = PyUnicode_FromFormatV(format, vargs);
2315 va_end(vargs);
2316 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002317}
2318
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002319#ifdef HAVE_WCHAR_H
2320
Victor Stinner5593d8a2010-10-02 11:11:27 +00002321/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2322 convert a Unicode object to a wide character string.
2323
Victor Stinnerd88d9832011-09-06 02:00:05 +02002324 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002325 character) required to convert the unicode object. Ignore size argument.
2326
Victor Stinnerd88d9832011-09-06 02:00:05 +02002327 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002328 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002329 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002330static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00002331unicode_aswidechar(PyUnicodeObject *unicode,
2332 wchar_t *w,
2333 Py_ssize_t size)
2334{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002335 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002336 const wchar_t *wstr;
2337
2338 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
2339 if (wstr == NULL)
2340 return -1;
2341
Victor Stinner5593d8a2010-10-02 11:11:27 +00002342 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002343 if (size > res)
2344 size = res + 1;
2345 else
2346 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002347 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002348 return res;
2349 }
2350 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002351 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002352}
2353
2354Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002355PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002356 wchar_t *w,
2357 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002358{
2359 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002360 PyErr_BadInternalCall();
2361 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002362 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002363 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002364}
2365
Victor Stinner137c34c2010-09-29 10:25:54 +00002366wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002367PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002368 Py_ssize_t *size)
2369{
2370 wchar_t* buffer;
2371 Py_ssize_t buflen;
2372
2373 if (unicode == NULL) {
2374 PyErr_BadInternalCall();
2375 return NULL;
2376 }
2377
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002378 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002379 if (buflen == -1)
2380 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002381 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002382 PyErr_NoMemory();
2383 return NULL;
2384 }
2385
Victor Stinner137c34c2010-09-29 10:25:54 +00002386 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2387 if (buffer == NULL) {
2388 PyErr_NoMemory();
2389 return NULL;
2390 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002391 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002392 if (buflen == -1)
2393 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002394 if (size != NULL)
2395 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002396 return buffer;
2397}
2398
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002399#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002400
Alexander Belopolsky40018472011-02-26 01:02:56 +00002401PyObject *
2402PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002403{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002404 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002405 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002406 PyErr_SetString(PyExc_ValueError,
2407 "chr() arg not in range(0x110000)");
2408 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002409 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002410
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002411 if (ordinal < 256)
2412 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002413
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002414 v = PyUnicode_New(1, ordinal);
2415 if (v == NULL)
2416 return NULL;
2417 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
2418 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002419}
2420
Alexander Belopolsky40018472011-02-26 01:02:56 +00002421PyObject *
2422PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002423{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002424 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002425 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002426 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002427 if (PyUnicode_READY(obj))
2428 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002429 Py_INCREF(obj);
2430 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002431 }
2432 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002433 /* For a Unicode subtype that's not a Unicode object,
2434 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002435 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002436 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002437 PyErr_Format(PyExc_TypeError,
2438 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002439 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002440 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002441}
2442
Alexander Belopolsky40018472011-02-26 01:02:56 +00002443PyObject *
2444PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002445 const char *encoding,
2446 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002447{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002448 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002449 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002450
Guido van Rossumd57fd912000-03-10 22:53:23 +00002451 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002452 PyErr_BadInternalCall();
2453 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002454 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002455
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002456 /* Decoding bytes objects is the most common case and should be fast */
2457 if (PyBytes_Check(obj)) {
2458 if (PyBytes_GET_SIZE(obj) == 0) {
2459 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002460 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002461 }
2462 else {
2463 v = PyUnicode_Decode(
2464 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2465 encoding, errors);
2466 }
2467 return v;
2468 }
2469
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002470 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002471 PyErr_SetString(PyExc_TypeError,
2472 "decoding str is not supported");
2473 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002474 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002475
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002476 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2477 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2478 PyErr_Format(PyExc_TypeError,
2479 "coercing to str: need bytes, bytearray "
2480 "or buffer-like object, %.80s found",
2481 Py_TYPE(obj)->tp_name);
2482 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002483 }
Tim Petersced69f82003-09-16 20:30:58 +00002484
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002485 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002486 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002487 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002488 }
Tim Petersced69f82003-09-16 20:30:58 +00002489 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002490 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002491
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002492 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002493 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002494}
2495
/* Write a normalized copy of encoding into lower: upper-case letters are
   folded to lower case and '_' becomes '-', so that e.g. UTF_8 is
   recognised.  lower_len is the capacity of lower including the null.

   Returns 1 on success and 0 when encoding does not fit (it is longer than
   lower_len-1 characters); in the latter case lower is left unterminated. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *in;
    char *out = lower;
    char *const limit = &lower[lower_len - 1];  /* slot kept for the null */

    for (in = encoding; *in != '\0'; in++) {
        char ch = *in;

        if (out == limit)
            return 0;

        if (Py_ISUPPER(ch))
            ch = Py_TOLOWER(ch);
        else if (ch == '_')
            ch = '-';
        *out++ = ch;
    }
    *out = '\0';
    return 1;
}
2528
Alexander Belopolsky40018472011-02-26 01:02:56 +00002529PyObject *
2530PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002531 Py_ssize_t size,
2532 const char *encoding,
2533 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002534{
2535 PyObject *buffer = NULL, *unicode;
2536 Py_buffer info;
2537 char lower[11]; /* Enough for any encoding shortcut */
2538
2539 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002540 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00002541
2542 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002543 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002544 if ((strcmp(lower, "utf-8") == 0) ||
2545 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002546 return PyUnicode_DecodeUTF8(s, size, errors);
2547 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002548 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002549 (strcmp(lower, "iso-8859-1") == 0))
2550 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002551#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002552 else if (strcmp(lower, "mbcs") == 0)
2553 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002554#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002555 else if (strcmp(lower, "ascii") == 0)
2556 return PyUnicode_DecodeASCII(s, size, errors);
2557 else if (strcmp(lower, "utf-16") == 0)
2558 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2559 else if (strcmp(lower, "utf-32") == 0)
2560 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2561 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002562
2563 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002564 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002565 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002566 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002567 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002568 if (buffer == NULL)
2569 goto onError;
2570 unicode = PyCodec_Decode(buffer, encoding, errors);
2571 if (unicode == NULL)
2572 goto onError;
2573 if (!PyUnicode_Check(unicode)) {
2574 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002575 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002576 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002577 Py_DECREF(unicode);
2578 goto onError;
2579 }
2580 Py_DECREF(buffer);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02002581 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002582 Py_DECREF(unicode);
2583 return NULL;
2584 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002585 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002586
Benjamin Peterson29060642009-01-31 22:14:21 +00002587 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002588 Py_XDECREF(buffer);
2589 return NULL;
2590}
2591
Alexander Belopolsky40018472011-02-26 01:02:56 +00002592PyObject *
2593PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002594 const char *encoding,
2595 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002596{
2597 PyObject *v;
2598
2599 if (!PyUnicode_Check(unicode)) {
2600 PyErr_BadArgument();
2601 goto onError;
2602 }
2603
2604 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002605 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002606
2607 /* Decode via the codec registry */
2608 v = PyCodec_Decode(unicode, encoding, errors);
2609 if (v == NULL)
2610 goto onError;
2611 return v;
2612
Benjamin Peterson29060642009-01-31 22:14:21 +00002613 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002614 return NULL;
2615}
2616
Alexander Belopolsky40018472011-02-26 01:02:56 +00002617PyObject *
2618PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002619 const char *encoding,
2620 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002621{
2622 PyObject *v;
2623
2624 if (!PyUnicode_Check(unicode)) {
2625 PyErr_BadArgument();
2626 goto onError;
2627 }
2628
2629 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002630 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002631
2632 /* Decode via the codec registry */
2633 v = PyCodec_Decode(unicode, encoding, errors);
2634 if (v == NULL)
2635 goto onError;
2636 if (!PyUnicode_Check(v)) {
2637 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002638 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002639 Py_TYPE(v)->tp_name);
2640 Py_DECREF(v);
2641 goto onError;
2642 }
2643 return v;
2644
Benjamin Peterson29060642009-01-31 22:14:21 +00002645 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002646 return NULL;
2647}
2648
Alexander Belopolsky40018472011-02-26 01:02:56 +00002649PyObject *
2650PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002651 Py_ssize_t size,
2652 const char *encoding,
2653 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002654{
2655 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002656
Guido van Rossumd57fd912000-03-10 22:53:23 +00002657 unicode = PyUnicode_FromUnicode(s, size);
2658 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002659 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002660 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2661 Py_DECREF(unicode);
2662 return v;
2663}
2664
Alexander Belopolsky40018472011-02-26 01:02:56 +00002665PyObject *
2666PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002667 const char *encoding,
2668 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002669{
2670 PyObject *v;
2671
2672 if (!PyUnicode_Check(unicode)) {
2673 PyErr_BadArgument();
2674 goto onError;
2675 }
2676
2677 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002678 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002679
2680 /* Encode via the codec registry */
2681 v = PyCodec_Encode(unicode, encoding, errors);
2682 if (v == NULL)
2683 goto onError;
2684 return v;
2685
Benjamin Peterson29060642009-01-31 22:14:21 +00002686 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002687 return NULL;
2688}
2689
Victor Stinnerad158722010-10-27 00:25:46 +00002690PyObject *
2691PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00002692{
Victor Stinner99b95382011-07-04 14:23:54 +02002693#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002694 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2695 PyUnicode_GET_SIZE(unicode),
2696 NULL);
2697#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002698 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00002699#else
Victor Stinner793b5312011-04-27 00:24:21 +02002700 PyInterpreterState *interp = PyThreadState_GET()->interp;
2701 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2702 cannot use it to encode and decode filenames before it is loaded. Load
2703 the Python codec requires to encode at least its own filename. Use the C
2704 version of the locale codec until the codec registry is initialized and
2705 the Python codec is loaded.
2706
2707 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2708 cannot only rely on it: check also interp->fscodec_initialized for
2709 subinterpreters. */
2710 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00002711 return PyUnicode_AsEncodedString(unicode,
2712 Py_FileSystemDefaultEncoding,
2713 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00002714 }
2715 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002716 /* locale encoding with surrogateescape */
2717 wchar_t *wchar;
2718 char *bytes;
2719 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00002720 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002721
2722 wchar = PyUnicode_AsWideCharString(unicode, NULL);
2723 if (wchar == NULL)
2724 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002725 bytes = _Py_wchar2char(wchar, &error_pos);
2726 if (bytes == NULL) {
2727 if (error_pos != (size_t)-1) {
2728 char *errmsg = strerror(errno);
2729 PyObject *exc = NULL;
2730 if (errmsg == NULL)
2731 errmsg = "Py_wchar2char() failed";
2732 raise_encode_exception(&exc,
2733 "filesystemencoding",
2734 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
2735 error_pos, error_pos+1,
2736 errmsg);
2737 Py_XDECREF(exc);
2738 }
2739 else
2740 PyErr_NoMemory();
2741 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002742 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002743 }
2744 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002745
2746 bytes_obj = PyBytes_FromString(bytes);
2747 PyMem_Free(bytes);
2748 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00002749 }
Victor Stinnerad158722010-10-27 00:25:46 +00002750#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00002751}
2752
Alexander Belopolsky40018472011-02-26 01:02:56 +00002753PyObject *
2754PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002755 const char *encoding,
2756 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002757{
2758 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00002759 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00002760
Guido van Rossumd57fd912000-03-10 22:53:23 +00002761 if (!PyUnicode_Check(unicode)) {
2762 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002763 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002764 }
Fred Drakee4315f52000-05-09 19:53:39 +00002765
Victor Stinner2f283c22011-03-02 01:21:46 +00002766 if (encoding == NULL) {
2767 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002768 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002769 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002770 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner2f283c22011-03-02 01:21:46 +00002771 }
Fred Drakee4315f52000-05-09 19:53:39 +00002772
2773 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002774 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002775 if ((strcmp(lower, "utf-8") == 0) ||
2776 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00002777 {
Victor Stinner2f283c22011-03-02 01:21:46 +00002778 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002779 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002780 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002781 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00002782 }
Victor Stinner37296e82010-06-10 13:36:23 +00002783 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002784 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002785 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002786 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002787#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002788 else if (strcmp(lower, "mbcs") == 0)
2789 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2790 PyUnicode_GET_SIZE(unicode),
2791 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002792#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002793 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002794 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00002795 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002796
2797 /* Encode via the codec registry */
2798 v = PyCodec_Encode(unicode, encoding, errors);
2799 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002800 return NULL;
2801
2802 /* The normal path */
2803 if (PyBytes_Check(v))
2804 return v;
2805
2806 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002807 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002808 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002809 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002810
2811 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
2812 "encoder %s returned bytearray instead of bytes",
2813 encoding);
2814 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002815 Py_DECREF(v);
2816 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002817 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002818
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002819 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
2820 Py_DECREF(v);
2821 return b;
2822 }
2823
2824 PyErr_Format(PyExc_TypeError,
2825 "encoder did not return a bytes object (type=%.400s)",
2826 Py_TYPE(v)->tp_name);
2827 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002828 return NULL;
2829}
2830
Alexander Belopolsky40018472011-02-26 01:02:56 +00002831PyObject *
2832PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002833 const char *encoding,
2834 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002835{
2836 PyObject *v;
2837
2838 if (!PyUnicode_Check(unicode)) {
2839 PyErr_BadArgument();
2840 goto onError;
2841 }
2842
2843 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002844 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002845
2846 /* Encode via the codec registry */
2847 v = PyCodec_Encode(unicode, encoding, errors);
2848 if (v == NULL)
2849 goto onError;
2850 if (!PyUnicode_Check(v)) {
2851 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002852 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002853 Py_TYPE(v)->tp_name);
2854 Py_DECREF(v);
2855 goto onError;
2856 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002857 return v;
Tim Petersced69f82003-09-16 20:30:58 +00002858
Benjamin Peterson29060642009-01-31 22:14:21 +00002859 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002860 return NULL;
2861}
2862
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002863PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00002864PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002865 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00002866 return PyUnicode_DecodeFSDefaultAndSize(s, size);
2867}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002868
Christian Heimes5894ba72007-11-04 11:43:14 +00002869PyObject*
2870PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
2871{
Victor Stinner99b95382011-07-04 14:23:54 +02002872#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002873 return PyUnicode_DecodeMBCS(s, size, NULL);
2874#elif defined(__APPLE__)
2875 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
2876#else
Victor Stinner793b5312011-04-27 00:24:21 +02002877 PyInterpreterState *interp = PyThreadState_GET()->interp;
2878 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2879 cannot use it to encode and decode filenames before it is loaded. Load
2880 the Python codec requires to encode at least its own filename. Use the C
2881 version of the locale codec until the codec registry is initialized and
2882 the Python codec is loaded.
2883
2884 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2885 cannot only rely on it: check also interp->fscodec_initialized for
2886 subinterpreters. */
2887 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002888 return PyUnicode_Decode(s, size,
2889 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00002890 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002891 }
2892 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002893 /* locale encoding with surrogateescape */
2894 wchar_t *wchar;
2895 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00002896 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002897
2898 if (s[size] != '\0' || size != strlen(s)) {
2899 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2900 return NULL;
2901 }
2902
Victor Stinner168e1172010-10-16 23:16:16 +00002903 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002904 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00002905 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002906
Victor Stinner168e1172010-10-16 23:16:16 +00002907 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002908 PyMem_Free(wchar);
2909 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002910 }
Victor Stinnerad158722010-10-27 00:25:46 +00002911#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002912}
2913
Martin v. Löwis011e8422009-05-05 04:43:17 +00002914
2915int
2916PyUnicode_FSConverter(PyObject* arg, void* addr)
2917{
2918 PyObject *output = NULL;
2919 Py_ssize_t size;
2920 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002921 if (arg == NULL) {
2922 Py_DECREF(*(PyObject**)addr);
2923 return 1;
2924 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00002925 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00002926 output = arg;
2927 Py_INCREF(output);
2928 }
2929 else {
2930 arg = PyUnicode_FromObject(arg);
2931 if (!arg)
2932 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00002933 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002934 Py_DECREF(arg);
2935 if (!output)
2936 return 0;
2937 if (!PyBytes_Check(output)) {
2938 Py_DECREF(output);
2939 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
2940 return 0;
2941 }
2942 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00002943 size = PyBytes_GET_SIZE(output);
2944 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002945 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05002946 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00002947 Py_DECREF(output);
2948 return 0;
2949 }
2950 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002951 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00002952}
2953
2954
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002955int
2956PyUnicode_FSDecoder(PyObject* arg, void* addr)
2957{
2958 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002959 if (arg == NULL) {
2960 Py_DECREF(*(PyObject**)addr);
2961 return 1;
2962 }
2963 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002964 if (PyUnicode_READY(arg))
2965 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002966 output = arg;
2967 Py_INCREF(output);
2968 }
2969 else {
2970 arg = PyBytes_FromObject(arg);
2971 if (!arg)
2972 return 0;
2973 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
2974 PyBytes_GET_SIZE(arg));
2975 Py_DECREF(arg);
2976 if (!output)
2977 return 0;
2978 if (!PyUnicode_Check(output)) {
2979 Py_DECREF(output);
2980 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
2981 return 0;
2982 }
2983 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002984 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
2985 PyUnicode_GET_LENGTH(output), 0, 1)) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002986 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2987 Py_DECREF(output);
2988 return 0;
2989 }
2990 *(PyObject**)addr = output;
2991 return Py_CLEANUP_SUPPORTED;
2992}
2993
2994
Martin v. Löwis5b222132007-06-10 09:51:05 +00002995char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002996PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002997{
Christian Heimesf3863112007-11-22 07:46:41 +00002998 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002999 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
3000
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003001 if (!PyUnicode_Check(unicode)) {
3002 PyErr_BadArgument();
3003 return NULL;
3004 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003005 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003006 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003007
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003008 if (PyUnicode_UTF8(unicode) == NULL) {
3009 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003010 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3011 if (bytes == NULL)
3012 return NULL;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003013 _PyUnicode_UTF8(u) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3014 if (_PyUnicode_UTF8(u) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003015 Py_DECREF(bytes);
3016 return NULL;
3017 }
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003018 _PyUnicode_UTF8_LENGTH(u) = PyBytes_GET_SIZE(bytes);
3019 Py_MEMCPY(_PyUnicode_UTF8(u), PyBytes_AS_STRING(bytes), _PyUnicode_UTF8_LENGTH(u) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003020 Py_DECREF(bytes);
3021 }
3022
3023 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003024 *psize = PyUnicode_UTF8_LENGTH(unicode);
3025 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003026}
3027
3028char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003029PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003030{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003031 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3032}
3033
/* Debug-build counter: incremented by PyUnicode_AsUnicodeAndSize() each
   time it finds an object without a wchar_t buffer and has to build one. */
3034#ifdef Py_DEBUG
3035int unicode_as_unicode_calls = 0;
3036#endif
3037
3038
/* Return the wchar_t buffer of a str object, building it on first use.

   unicode - the object whose buffer is requested; must be a str.
   size    - if not NULL, receives PyUnicode_WSTR_LENGTH() of the object.

   When the object has no wchar_t buffer yet, one is allocated with
   PyObject_MALLOC(), filled from the object's 1-, 2- or 4-byte data and
   stored in the object, so later calls return the stored buffer.

   Returns NULL with an exception set if the argument is not a str or if
   the allocation fails. */
3039Py_UNICODE *
3040PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3041{
3042 PyUnicodeObject *u;
3043 const unsigned char *one_byte;
/* Only the source pointers needed for the configured wchar_t width are
   declared; the other width's conversion code is compiled out below. */
3044#if SIZEOF_WCHAR_T == 4
3045 const Py_UCS2 *two_bytes;
3046#else
3047 const Py_UCS4 *four_bytes;
3048 const Py_UCS4 *ucs4_end;
3049 Py_ssize_t num_surrogates;
3050#endif
3051 wchar_t *w;
3052 wchar_t *wchar_end;
3053
3054 if (!PyUnicode_Check(unicode)) {
3055 PyErr_BadArgument();
3056 return NULL;
3057 }
3058 u = (PyUnicodeObject*)unicode;
3059 if (_PyUnicode_WSTR(u) == NULL) {
3060 /* Non-ASCII compact unicode object */
3061 assert(_PyUnicode_KIND(u) != 0);
3062 assert(PyUnicode_IS_READY(u));
3063
3064#ifdef Py_DEBUG
3065 ++unicode_as_unicode_calls;
3066#endif
3067
3068 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
3069#if SIZEOF_WCHAR_T == 2
/* First pass: count code points above 0xFFFF, each of which needs one
   extra wchar_t because it is written as a surrogate pair. */
3070 four_bytes = PyUnicode_4BYTE_DATA(u);
3071 ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
3072 num_surrogates = 0;
3073
3074 for (; four_bytes < ucs4_end; ++four_bytes) {
3075 if (*four_bytes > 0xFFFF)
3076 ++num_surrogates;
3077 }
3078
/* Room for every code point, the extra surrogate units and the
   terminating 0. */
3079 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
3080 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
3081 if (!_PyUnicode_WSTR(u)) {
3082 PyErr_NoMemory();
3083 return NULL;
3084 }
3085 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;
3086
/* Second pass: copy, splitting code points above 0xFFFF into a high
   (0xD800-based) and a low (0xDC00-based) surrogate. */
3087 w = _PyUnicode_WSTR(u);
3088 wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
3089 four_bytes = PyUnicode_4BYTE_DATA(u);
3090 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3091 if (*four_bytes > 0xFFFF) {
3092 /* encode surrogate pair in this case */
3093 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
3094 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
3095 }
3096 else
3097 *w = *four_bytes;
3098
/* Sanity check only: the write position must stay within the length
   computed by the first pass. */
3099 if (w > wchar_end) {
3100 assert(0 && "Miscalculated string end");
3101 }
3102 }
3103 *w = 0;
3104#else
3105 /* sizeof(wchar_t) == 4 */
3106 Py_FatalError("Impossible unicode object state, wstr and str "
3107 "should share memory already.");
3108 return NULL;
3109#endif
3110 }
3111 else {
/* 1-byte or 2-byte data: one wchar_t per code point plus the
   terminating 0. */
3112 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3113 (_PyUnicode_LENGTH(u) + 1));
3114 if (!_PyUnicode_WSTR(u)) {
3115 PyErr_NoMemory();
3116 return NULL;
3117 }
/* The wstr length is only written for objects that are not compact
   ASCII. */
3118 if (!PyUnicode_IS_COMPACT_ASCII(u))
3119 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
3120 w = _PyUnicode_WSTR(u);
3121 wchar_end = w + _PyUnicode_LENGTH(u);
3122
3123 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
3124 one_byte = PyUnicode_1BYTE_DATA(u);
3125 for (; w < wchar_end; ++one_byte, ++w)
3126 *w = *one_byte;
3127 /* null-terminate the wstr */
3128 *w = 0;
3129 }
3130 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
3131#if SIZEOF_WCHAR_T == 4
3132 two_bytes = PyUnicode_2BYTE_DATA(u);
3133 for (; w < wchar_end; ++two_bytes, ++w)
3134 *w = *two_bytes;
3135 /* null-terminate the wstr */
3136 *w = 0;
3137#else
3138 /* sizeof(wchar_t) == 2 */
/* Treated as an impossible state: release the buffer just allocated
   before aborting. */
3139 PyObject_FREE(_PyUnicode_WSTR(u));
3140 _PyUnicode_WSTR(u) = NULL;
3141 Py_FatalError("Impossible unicode object state, wstr "
3142 "and str should share memory already.");
3143 return NULL;
3144#endif
3145 }
3146 else {
3147 assert(0 && "This should never happen.");
3148 }
3149 }
3150 }
3151 if (size != NULL)
3152 *size = PyUnicode_WSTR_LENGTH(u);
3153 return _PyUnicode_WSTR(u);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003154}
3155
Alexander Belopolsky40018472011-02-26 01:02:56 +00003156Py_UNICODE *
3157PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003158{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003159 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003160}
3161
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003162
Alexander Belopolsky40018472011-02-26 01:02:56 +00003163Py_ssize_t
3164PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003165{
3166 if (!PyUnicode_Check(unicode)) {
3167 PyErr_BadArgument();
3168 goto onError;
3169 }
3170 return PyUnicode_GET_SIZE(unicode);
3171
Benjamin Peterson29060642009-01-31 22:14:21 +00003172 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003173 return -1;
3174}
3175
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003176Py_ssize_t
3177PyUnicode_GetLength(PyObject *unicode)
3178{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003179 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003180 PyErr_BadArgument();
3181 return -1;
3182 }
3183
3184 return PyUnicode_GET_LENGTH(unicode);
3185}
3186
3187Py_UCS4
3188PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3189{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003190 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3191 PyErr_BadArgument();
3192 return (Py_UCS4)-1;
3193 }
3194 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3195 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003196 return (Py_UCS4)-1;
3197 }
3198 return PyUnicode_READ_CHAR(unicode, index);
3199}
3200
3201int
3202PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3203{
3204 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003205 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003206 return -1;
3207 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02003208 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3209 PyErr_SetString(PyExc_IndexError, "string index out of range");
3210 return -1;
3211 }
3212 if (_PyUnicode_Dirty(unicode))
3213 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003214 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3215 index, ch);
3216 return 0;
3217}
3218
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3224
Victor Stinner554f3f02010-06-16 23:33:54 +00003225/* create or adjust a UnicodeDecodeError */
3226static void
3227make_decode_exception(PyObject **exceptionObject,
3228 const char *encoding,
3229 const char *input, Py_ssize_t length,
3230 Py_ssize_t startpos, Py_ssize_t endpos,
3231 const char *reason)
3232{
3233 if (*exceptionObject == NULL) {
3234 *exceptionObject = PyUnicodeDecodeError_Create(
3235 encoding, input, length, startpos, endpos, reason);
3236 }
3237 else {
3238 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3239 goto onError;
3240 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3241 goto onError;
3242 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3243 goto onError;
3244 }
3245 return;
3246
3247onError:
3248 Py_DECREF(*exceptionObject);
3249 *exceptionObject = NULL;
3250}
3251
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003252/* error handling callback helper:
3253 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003254 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003255 and adjust various state variables.
3256 return 0 on success, -1 on error
3257*/
3258
Alexander Belopolsky40018472011-02-26 01:02:56 +00003259static int
3260unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003261 const char *encoding, const char *reason,
3262 const char **input, const char **inend, Py_ssize_t *startinpos,
3263 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
3264 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003265{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003266 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003267
3268 PyObject *restuple = NULL;
3269 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003270 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003271 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003272 Py_ssize_t requiredsize;
3273 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003274 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003275 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003276 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003277 int res = -1;
3278
3279 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003280 *errorHandler = PyCodec_LookupError(errors);
3281 if (*errorHandler == NULL)
3282 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003283 }
3284
Victor Stinner554f3f02010-06-16 23:33:54 +00003285 make_decode_exception(exceptionObject,
3286 encoding,
3287 *input, *inend - *input,
3288 *startinpos, *endinpos,
3289 reason);
3290 if (*exceptionObject == NULL)
3291 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003292
3293 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3294 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003295 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003296 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003297 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003298 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003299 }
3300 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003301 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003302
3303 /* Copy back the bytes variables, which might have been modified by the
3304 callback */
3305 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3306 if (!inputobj)
3307 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003308 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003309 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003310 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003311 *input = PyBytes_AS_STRING(inputobj);
3312 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003313 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003314 /* we can DECREF safely, as the exception has another reference,
3315 so the object won't go away. */
3316 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003317
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003318 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003319 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003320 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003321 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3322 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003323 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003324
3325 /* need more space? (at least enough for what we
3326 have+the replacement+the rest of the string (starting
3327 at the new input position), so we won't have to check space
3328 when there are no errors in the rest of the string) */
3329 repptr = PyUnicode_AS_UNICODE(repunicode);
3330 repsize = PyUnicode_GET_SIZE(repunicode);
3331 requiredsize = *outpos + repsize + insize-newpos;
3332 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003333 if (requiredsize<2*outsize)
3334 requiredsize = 2*outsize;
Victor Stinnerfe226c02011-10-03 03:52:20 +02003335 if (PyUnicode_Resize((PyObject**)output, requiredsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003336 goto onError;
3337 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003338 }
3339 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003340 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003341 Py_UNICODE_COPY(*outptr, repptr, repsize);
3342 *outptr += repsize;
3343 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003344
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003345 /* we made it! */
3346 res = 0;
3347
Benjamin Peterson29060642009-01-31 22:14:21 +00003348 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003349 Py_XDECREF(restuple);
3350 return res;
3351}
3352
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003353/* --- UTF-7 Codec -------------------------------------------------------- */
3354
Antoine Pitrou244651a2009-05-04 18:56:13 +00003355/* See RFC2152 for details. We encode conservatively and decode liberally. */
3356
/* Three simple macros defining base-64. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

/* Read-only lookup table, indexed by ASCII code (0..127). */
static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * The range test comes first so that the table is never indexed with a
 * value outside 1..127.  directO and directWS are parenthesized so that
 * any expression can be passed for them without changing the grouping. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003433
Alexander Belopolsky40018472011-02-26 01:02:56 +00003434PyObject *
3435PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003436 Py_ssize_t size,
3437 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003438{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003439 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3440}
3441
Antoine Pitrou244651a2009-05-04 18:56:13 +00003442/* The decoder. The only state we preserve is our read position,
3443 * i.e. how many characters we have consumed. So if we end in the
3444 * middle of a shift sequence we have to back off the read position
3445 * and the output to the beginning of the sequence, otherwise we lose
3446 * all the shift state (seen bits, number of bits seen, high
3447 * surrogate). */
3448
/* Decode size bytes of UTF-7 data at s into a new str object.

   errors   - name of the error handler, passed on to
              unicode_decode_call_errorhandler().
   consumed - if not NULL, receives the number of input bytes consumed; an
              unfinished shift sequence at the end of the input is then not
              an error: output and read position are backed off to the start
              of that sequence.  If NULL, an inconsistent unfinished shift
              sequence is reported as "unterminated shift sequence".

   Returns a new reference, or NULL with an exception set. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00003449PyObject *
3450PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003451 Py_ssize_t size,
3452 const char *errors,
3453 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003454{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003455 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003456 Py_ssize_t startinpos;
3457 Py_ssize_t endinpos;
3458 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003459 const char *e;
3460 PyUnicodeObject *unicode;
3461 Py_UNICODE *p;
3462 const char *errmsg = "";
/* Shift state: whether we are inside a "+...-" base-64 section, where its
   output started, the pending base-64 bits and a pending high surrogate. */
3463 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003464 Py_UNICODE *shiftOutStart;
3465 unsigned int base64bits = 0;
3466 unsigned long base64buffer = 0;
3467 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003468 PyObject *errorHandler = NULL;
3469 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003470
/* The output object is created with the input size and resized to the
   number of units actually written before returning. */
3471 unicode = _PyUnicode_New(size);
3472 if (!unicode)
3473 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003474 if (size == 0) {
3475 if (consumed)
3476 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003477 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003478 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003479
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003480 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003481 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003482 e = s + size;
3483
3484 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003485 Py_UNICODE ch;
/* Also entered by the "goto restart" after the loop, when the
   end-of-input error handler left input to be decoded (s < e). */
Benjamin Peterson29060642009-01-31 22:14:21 +00003486 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003487 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003488
Antoine Pitrou244651a2009-05-04 18:56:13 +00003489 if (inShift) { /* in a base-64 section */
3490 if (IS_BASE64(ch)) { /* consume a base-64 character */
/* Each base-64 character contributes 6 bits to the accumulator. */
3491 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3492 base64bits += 6;
3493 s++;
3494 if (base64bits >= 16) {
3495 /* we have enough bits for a UTF-16 value */
3496 Py_UNICODE outCh = (Py_UNICODE)
3497 (base64buffer >> (base64bits-16));
3498 base64bits -= 16;
3499 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3500 if (surrogate) {
3501 /* expecting a second surrogate */
3502 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
/* Wide builds combine the pair into one code point; narrow builds store
   both surrogates. */
3503#ifdef Py_UNICODE_WIDE
3504 *p++ = (((surrogate & 0x3FF)<<10)
3505 | (outCh & 0x3FF)) + 0x10000;
3506#else
3507 *p++ = surrogate;
3508 *p++ = outCh;
3509#endif
3510 surrogate = 0;
3511 }
3512 else {
3513 surrogate = 0;
3514 errmsg = "second surrogate missing";
3515 goto utf7Error;
3516 }
3517 }
3518 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3519 /* first surrogate */
3520 surrogate = outCh;
3521 }
3522 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3523 errmsg = "unexpected second surrogate";
3524 goto utf7Error;
3525 }
3526 else {
3527 *p++ = outCh;
3528 }
3529 }
3530 }
3531 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003532 inShift = 0;
3533 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003534 if (surrogate) {
3535 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003536 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003537 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003538 if (base64bits > 0) { /* left-over bits */
3539 if (base64bits >= 6) {
3540 /* We've seen at least one base-64 character */
3541 errmsg = "partial character in shift sequence";
3542 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003543 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003544 else {
3545 /* Some bits remain; they should be zero */
3546 if (base64buffer != 0) {
3547 errmsg = "non-zero padding bits in shift sequence";
3548 goto utf7Error;
3549 }
3550 }
3551 }
3552 if (ch != '-') {
3553 /* '-' is absorbed; other terminating
3554 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003555 *p++ = ch;
3556 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003557 }
3558 }
3559 else if ( ch == '+' ) {
/* Remember where the shift sequence starts: used as the error start
   position and as the back-off position for stateful decoding. */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003560 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003561 s++; /* consume '+' */
3562 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003563 s++;
3564 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003565 }
3566 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003567 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003568 shiftOutStart = p;
3569 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003570 }
3571 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003572 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003573 *p++ = ch;
3574 s++;
3575 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003576 else {
3577 startinpos = s-starts;
3578 s++;
3579 errmsg = "unexpected special character";
3580 goto utf7Error;
3581 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003582 continue;
/* Error path, reached only through goto: hand the range
   [startinpos, endinpos) to the error handler, which may update the input
   pointers, the read position s, the output object and the write
   pointer p. */
Antoine Pitrou244651a2009-05-04 18:56:13 +00003583utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003584 outpos = p-PyUnicode_AS_UNICODE(unicode);
3585 endinpos = s-starts;
3586 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003587 errors, &errorHandler,
3588 "utf7", errmsg,
3589 &starts, &e, &startinpos, &endinpos, &exc, &s,
3590 &unicode, &outpos, &p))
3591 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003592 }
3593
Antoine Pitrou244651a2009-05-04 18:56:13 +00003594 /* end of string */
3595
3596 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3597 /* if we're in an inconsistent state, that's an error */
3598 if (surrogate ||
3599 (base64bits >= 6) ||
3600 (base64bits > 0 && base64buffer != 0)) {
3601 outpos = p-PyUnicode_AS_UNICODE(unicode);
3602 endinpos = size;
3603 if (unicode_decode_call_errorhandler(
3604 errors, &errorHandler,
3605 "utf7", "unterminated shift sequence",
3606 &starts, &e, &startinpos, &endinpos, &exc, &s,
3607 &unicode, &outpos, &p))
3608 goto onError;
/* The handler may have repositioned s inside the input; resume decoding
   inside the loop in that case. */
3609 if (s < e)
3610 goto restart;
3611 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003612 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003613
3614 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003615 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003616 if (inShift) {
3617 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003618 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003619 }
3620 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003621 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003622 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003623 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003624
/* Shrink the result to the number of units actually written. */
Victor Stinnerfe226c02011-10-03 03:52:20 +02003625 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003626 goto onError;
3627
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003628 Py_XDECREF(errorHandler);
3629 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02003630 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003631 Py_DECREF(unicode);
3632 return NULL;
3633 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003634 return (PyObject *)unicode;
3635
/* Common failure exit: release the cached handler and exception and the
   partially built result. */
Benjamin Peterson29060642009-01-31 22:14:21 +00003636 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003637 Py_XDECREF(errorHandler);
3638 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003639 Py_DECREF(unicode);
3640 return NULL;
3641}
3642
3643
Alexander Belopolsky40018472011-02-26 01:02:56 +00003644PyObject *
3645PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003646 Py_ssize_t size,
3647 int base64SetO,
3648 int base64WhiteSpace,
3649 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003650{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003651 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003652 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003653 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003654 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003655 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003656 unsigned int base64bits = 0;
3657 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003658 char * out;
3659 char * start;
3660
3661 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003662 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003663
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003664 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003665 return PyErr_NoMemory();
3666
Antoine Pitrou244651a2009-05-04 18:56:13 +00003667 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003668 if (v == NULL)
3669 return NULL;
3670
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003671 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003672 for (;i < size; ++i) {
3673 Py_UNICODE ch = s[i];
3674
Antoine Pitrou244651a2009-05-04 18:56:13 +00003675 if (inShift) {
3676 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3677 /* shifting out */
3678 if (base64bits) { /* output remaining bits */
3679 *out++ = TO_BASE64(base64buffer << (6-base64bits));
3680 base64buffer = 0;
3681 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003682 }
3683 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003684 /* Characters not in the BASE64 set implicitly unshift the sequence
3685 so no '-' is required, except if the character is itself a '-' */
3686 if (IS_BASE64(ch) || ch == '-') {
3687 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003688 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003689 *out++ = (char) ch;
3690 }
3691 else {
3692 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00003693 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003694 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003695 else { /* not in a shift sequence */
3696 if (ch == '+') {
3697 *out++ = '+';
3698 *out++ = '-';
3699 }
3700 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3701 *out++ = (char) ch;
3702 }
3703 else {
3704 *out++ = '+';
3705 inShift = 1;
3706 goto encode_char;
3707 }
3708 }
3709 continue;
3710encode_char:
3711#ifdef Py_UNICODE_WIDE
3712 if (ch >= 0x10000) {
3713 /* code first surrogate */
3714 base64bits += 16;
3715 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
3716 while (base64bits >= 6) {
3717 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3718 base64bits -= 6;
3719 }
3720 /* prepare second surrogate */
3721 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
3722 }
3723#endif
3724 base64bits += 16;
3725 base64buffer = (base64buffer << 16) | ch;
3726 while (base64bits >= 6) {
3727 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3728 base64bits -= 6;
3729 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00003730 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003731 if (base64bits)
3732 *out++= TO_BASE64(base64buffer << (6-base64bits) );
3733 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003734 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003735 if (_PyBytes_Resize(&v, out - start) < 0)
3736 return NULL;
3737 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003738}
3739
Antoine Pitrou244651a2009-05-04 18:56:13 +00003740#undef IS_BASE64
3741#undef FROM_BASE64
3742#undef TO_BASE64
3743#undef DECODE_DIRECT
3744#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003745
Guido van Rossumd57fd912000-03-10 22:53:23 +00003746/* --- UTF-8 Codec -------------------------------------------------------- */
3747
/* Lookup table indexed by the first byte of a UTF-8 sequence; the entry is
   the total length in bytes of the sequence that byte introduces.
   A value of 0 marks a byte that can never start a sequence (continuation
   bytes 80-BF, the overlong prefixes C0-C1, and F5-FF).
   See RFC 3629 for details. */
static
char utf8_code_length[256] = {
    /* 00-7F: ASCII, always a single byte */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    /* 80-BF: continuation bytes, illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    /* C0-C1 are illegal; C2-DF start a two-byte sequence */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    /* E0-EF start a three-byte sequence */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    /* F0-F4 start a four-byte sequence; F5-FF are illegal */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-FF */
};
3769
Alexander Belopolsky40018472011-02-26 01:02:56 +00003770PyObject *
3771PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003772 Py_ssize_t size,
3773 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003774{
Walter Dörwald69652032004-09-07 20:24:22 +00003775 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3776}
3777
Antoine Pitrouab868312009-01-10 15:40:25 +00003778/* Mask to check or force alignment of a pointer to C 'long' boundaries */
3779#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
3780
3781/* Mask to quickly check whether a C 'long' contains a
3782 non-ASCII, UTF8-encoded char. */
3783#if (SIZEOF_LONG == 8)
3784# define ASCII_CHAR_MASK 0x8080808080808080L
3785#elif (SIZEOF_LONG == 4)
3786# define ASCII_CHAR_MASK 0x80808080L
3787#else
3788# error C 'long' size should be either 4 or 8!
3789#endif
3790
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003791/* Scans a UTF-8 string and returns the maximum character to be expected,
3792 the size of the decoded unicode string and if any major errors were
3793 encountered.
3794
3795 This function does check basic UTF-8 sanity, it does however NOT CHECK
3796 if the string contains surrogates, and if all continuation bytes are
3797 within the correct ranges, these checks are performed in
3798 PyUnicode_DecodeUTF8Stateful.
3799
3800 If it sets has_errors to 1, it means the value of unicode_size and max_char
3801 will be bogus and you should not rely on useful information in them.
3802 */
3803static Py_UCS4
3804utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
3805 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
3806 int *has_errors)
3807{
3808 Py_ssize_t n;
3809 Py_ssize_t char_count = 0;
3810 Py_UCS4 max_char = 127, new_max;
3811 Py_UCS4 upper_bound;
3812 const unsigned char *p = (const unsigned char *)s;
3813 const unsigned char *end = p + string_size;
3814 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
3815 int err = 0;
3816
3817 for (; p < end && !err; ++p, ++char_count) {
3818 /* Only check value if it's not a ASCII char... */
3819 if (*p < 0x80) {
3820 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
3821 an explanation. */
3822 if (!((size_t) p & LONG_PTR_MASK)) {
3823 /* Help register allocation */
3824 register const unsigned char *_p = p;
3825 while (_p < aligned_end) {
3826 unsigned long value = *(unsigned long *) _p;
3827 if (value & ASCII_CHAR_MASK)
3828 break;
3829 _p += SIZEOF_LONG;
3830 char_count += SIZEOF_LONG;
3831 }
3832 p = _p;
3833 if (p == end)
3834 break;
3835 }
3836 }
3837 if (*p >= 0x80) {
3838 n = utf8_code_length[*p];
3839 new_max = max_char;
3840 switch (n) {
3841 /* invalid start byte */
3842 case 0:
3843 err = 1;
3844 break;
3845 case 2:
3846 /* Code points between 0x00FF and 0x07FF inclusive.
3847 Approximate the upper bound of the code point,
3848 if this flips over 255 we can be sure it will be more
3849 than 255 and the string will need 2 bytes per code coint,
3850 if it stays under or equal to 255, we can be sure 1 byte
3851 is enough.
3852 ((*p & 0b00011111) << 6) | 0b00111111 */
3853 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
3854 if (max_char < upper_bound)
3855 new_max = upper_bound;
3856 /* Ensure we track at least that we left ASCII space. */
3857 if (new_max < 128)
3858 new_max = 128;
3859 break;
3860 case 3:
3861 /* Between 0x0FFF and 0xFFFF inclusive, so values are
3862 always > 255 and <= 65535 and will always need 2 bytes. */
3863 if (max_char < 65535)
3864 new_max = 65535;
3865 break;
3866 case 4:
3867 /* Code point will be above 0xFFFF for sure in this case. */
3868 new_max = 65537;
3869 break;
3870 /* Internal error, this should be caught by the first if */
3871 case 1:
3872 default:
3873 assert(0 && "Impossible case in utf8_max_char_and_size");
3874 err = 1;
3875 }
3876 /* Instead of number of overall bytes for this code point,
3877 n containts the number of following bytes: */
3878 --n;
3879 /* Check if the follow up chars are all valid continuation bytes */
3880 if (n >= 1) {
3881 const unsigned char *cont;
3882 if ((p + n) >= end) {
3883 if (consumed == 0)
3884 /* incomplete data, non-incremental decoding */
3885 err = 1;
3886 break;
3887 }
3888 for (cont = p + 1; cont < (p + n); ++cont) {
3889 if ((*cont & 0xc0) != 0x80) {
3890 err = 1;
3891 break;
3892 }
3893 }
3894 p += n;
3895 }
3896 else
3897 err = 1;
3898 max_char = new_max;
3899 }
3900 }
3901
3902 if (unicode_size)
3903 *unicode_size = char_count;
3904 if (has_errors)
3905 *has_errors = err;
3906 return max_char;
3907}
3908
/* Store `value` at position `index` of the character buffer `buf`, where
   `kind` selects the element type of the buffer.  Works like
   PyUnicode_WRITE but additionally understands PyUnicode_WCHAR_KIND, i.e.
   the wstr field of the legacy unicode representation.  Any kind other
   than WCHAR, 1BYTE or 2BYTE is stored as Py_UCS4.  Exactly one of the
   stores is executed, so `buf`, `index` and `value` are each evaluated
   once. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value) \
    do { \
        switch ((int)(kind)) { \
        case PyUnicode_WCHAR_KIND: \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
            break; \
        case PyUnicode_1BYTE_KIND: \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
            break; \
        case PyUnicode_2BYTE_KIND: \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value); \
            break; \
        default: \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value); \
            break; \
        } \
    } while (0)
3923
Alexander Belopolsky40018472011-02-26 01:02:56 +00003924PyObject *
3925PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003926 Py_ssize_t size,
3927 const char *errors,
3928 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003929{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003930 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003931 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00003932 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003933 Py_ssize_t startinpos;
3934 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00003935 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003936 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003937 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003938 PyObject *errorHandler = NULL;
3939 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003940 Py_UCS4 maxchar = 0;
3941 Py_ssize_t unicode_size;
3942 Py_ssize_t i;
3943 int kind;
3944 void *data;
3945 int has_errors;
3946 Py_UNICODE *error_outptr;
3947#if SIZEOF_WCHAR_T == 2
3948 Py_ssize_t wchar_offset = 0;
3949#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003950
Walter Dörwald69652032004-09-07 20:24:22 +00003951 if (size == 0) {
3952 if (consumed)
3953 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003954 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00003955 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003956 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
3957 consumed, &has_errors);
3958 if (has_errors) {
3959 unicode = _PyUnicode_New(size);
3960 if (!unicode)
3961 return NULL;
3962 kind = PyUnicode_WCHAR_KIND;
3963 data = PyUnicode_AS_UNICODE(unicode);
3964 assert(data != NULL);
3965 }
3966 else {
3967 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
3968 if (!unicode)
3969 return NULL;
3970 /* When the string is ASCII only, just use memcpy and return.
3971 unicode_size may be != size if there is an incomplete UTF-8
3972 sequence at the end of the ASCII block. */
3973 if (maxchar < 128 && size == unicode_size) {
3974 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
3975 return (PyObject *)unicode;
3976 }
3977 kind = PyUnicode_KIND(unicode);
3978 data = PyUnicode_DATA(unicode);
3979 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003980 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003981 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003982 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00003983 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003984
3985 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003986 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003987
3988 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00003989 /* Fast path for runs of ASCII characters. Given that common UTF-8
3990 input will consist of an overwhelming majority of ASCII
3991 characters, we try to optimize for this case by checking
3992 as many characters as a C 'long' can contain.
3993 First, check if we can do an aligned read, as most CPUs have
3994 a penalty for unaligned reads.
3995 */
3996 if (!((size_t) s & LONG_PTR_MASK)) {
3997 /* Help register allocation */
3998 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003999 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004000 while (_s < aligned_end) {
4001 /* Read a whole long at a time (either 4 or 8 bytes),
4002 and do a fast unrolled copy if it only contains ASCII
4003 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004004 unsigned long value = *(unsigned long *) _s;
4005 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004006 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004007 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
4008 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
4009 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
4010 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004011#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004012 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
4013 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
4014 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
4015 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004016#endif
4017 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004018 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004019 }
4020 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004021 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004022 if (s == e)
4023 break;
4024 ch = (unsigned char)*s;
4025 }
4026 }
4027
4028 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004029 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004030 s++;
4031 continue;
4032 }
4033
4034 n = utf8_code_length[ch];
4035
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004036 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004037 if (consumed)
4038 break;
4039 else {
4040 errmsg = "unexpected end of data";
4041 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004042 endinpos = startinpos+1;
4043 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4044 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004045 goto utf8Error;
4046 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004047 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004048
4049 switch (n) {
4050
4051 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004052 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004053 startinpos = s-starts;
4054 endinpos = startinpos+1;
4055 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004056
4057 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004058 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004059 startinpos = s-starts;
4060 endinpos = startinpos+1;
4061 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004062
4063 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004064 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004065 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004066 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004067 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004068 goto utf8Error;
4069 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004070 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004071 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004072 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004073 break;
4074
4075 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004076 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4077 will result in surrogates in range d800-dfff. Surrogates are
4078 not valid UTF-8 so they are rejected.
4079 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4080 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004081 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004082 (s[2] & 0xc0) != 0x80 ||
4083 ((unsigned char)s[0] == 0xE0 &&
4084 (unsigned char)s[1] < 0xA0) ||
4085 ((unsigned char)s[0] == 0xED &&
4086 (unsigned char)s[1] > 0x9F)) {
4087 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004088 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004089 endinpos = startinpos + 1;
4090
4091 /* if s[1] first two bits are 1 and 0, then the invalid
4092 continuation byte is s[2], so increment endinpos by 1,
4093 if not, s[1] is invalid and endinpos doesn't need to
4094 be incremented. */
4095 if ((s[1] & 0xC0) == 0x80)
4096 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004097 goto utf8Error;
4098 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004099 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004100 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004101 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004102 break;
4103
4104 case 4:
4105 if ((s[1] & 0xc0) != 0x80 ||
4106 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004107 (s[3] & 0xc0) != 0x80 ||
4108 ((unsigned char)s[0] == 0xF0 &&
4109 (unsigned char)s[1] < 0x90) ||
4110 ((unsigned char)s[0] == 0xF4 &&
4111 (unsigned char)s[1] > 0x8F)) {
4112 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004113 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004114 endinpos = startinpos + 1;
4115 if ((s[1] & 0xC0) == 0x80) {
4116 endinpos++;
4117 if ((s[2] & 0xC0) == 0x80)
4118 endinpos++;
4119 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004120 goto utf8Error;
4121 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004122 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004123 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4124 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4125
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004126 /* If the string is flexible or we have native UCS-4, write
4127 directly.. */
4128 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
4129 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00004130
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004131 else {
4132 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00004133
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004134 /* translate from 10000..10FFFF to 0..FFFF */
4135 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00004136
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004137 /* high surrogate = top 10 bits added to D800 */
4138 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4139 (Py_UNICODE)(0xD800 + (ch >> 10)));
4140
4141 /* low surrogate = bottom 10 bits added to DC00 */
4142 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4143 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
4144 }
4145#if SIZEOF_WCHAR_T == 2
4146 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004147#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004148 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004149 }
4150 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004151 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004152
Benjamin Peterson29060642009-01-31 22:14:21 +00004153 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004154 /* If this is not yet a resizable string, make it one.. */
4155 if (kind != PyUnicode_WCHAR_KIND) {
4156 const Py_UNICODE *u;
4157 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
4158 if (!new_unicode)
4159 goto onError;
4160 u = PyUnicode_AsUnicode((PyObject *)unicode);
4161 if (!u)
4162 goto onError;
4163#if SIZEOF_WCHAR_T == 2
4164 i += wchar_offset;
4165#endif
4166 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
4167 Py_DECREF(unicode);
4168 unicode = new_unicode;
4169 kind = 0;
4170 data = PyUnicode_AS_UNICODE(new_unicode);
4171 assert(data != NULL);
4172 }
4173 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00004174 if (unicode_decode_call_errorhandler(
4175 errors, &errorHandler,
4176 "utf8", errmsg,
4177 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004178 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00004179 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004180 /* Update data because unicode_decode_call_errorhandler might have
4181 re-created or resized the unicode object. */
4182 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00004183 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004184 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004185 /* Ensure the unicode_size calculation above was correct: */
4186 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
4187
Walter Dörwald69652032004-09-07 20:24:22 +00004188 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004189 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004190
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004191 /* Adjust length and ready string when it contained errors and
4192 is of the old resizable kind. */
4193 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004194 if (PyUnicode_Resize((PyObject**)&unicode, i) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004195 goto onError;
4196 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004197
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004198 Py_XDECREF(errorHandler);
4199 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004200 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004201 Py_DECREF(unicode);
4202 return NULL;
4203 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004204 return (PyObject *)unicode;
4205
Benjamin Peterson29060642009-01-31 22:14:21 +00004206 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004207 Py_XDECREF(errorHandler);
4208 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004209 Py_DECREF(unicode);
4210 return NULL;
4211}
4212
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004213#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00004214
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004215#ifdef __APPLE__
4216
4217/* Simplified UTF-8 decoder using surrogateescape error handler,
4218 used to decode the command line arguments on Mac OS X. */
4219
4220wchar_t*
4221_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4222{
4223 int n;
4224 const char *e;
4225 wchar_t *unicode, *p;
4226
4227 /* Note: size will always be longer than the resulting Unicode
4228 character count */
4229 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
4230 PyErr_NoMemory();
4231 return NULL;
4232 }
4233 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4234 if (!unicode)
4235 return NULL;
4236
4237 /* Unpack UTF-8 encoded data */
4238 p = unicode;
4239 e = s + size;
4240 while (s < e) {
4241 Py_UCS4 ch = (unsigned char)*s;
4242
4243 if (ch < 0x80) {
4244 *p++ = (wchar_t)ch;
4245 s++;
4246 continue;
4247 }
4248
4249 n = utf8_code_length[ch];
4250 if (s + n > e) {
4251 goto surrogateescape;
4252 }
4253
4254 switch (n) {
4255 case 0:
4256 case 1:
4257 goto surrogateescape;
4258
4259 case 2:
4260 if ((s[1] & 0xc0) != 0x80)
4261 goto surrogateescape;
4262 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
4263 assert ((ch > 0x007F) && (ch <= 0x07FF));
4264 *p++ = (wchar_t)ch;
4265 break;
4266
4267 case 3:
4268 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4269 will result in surrogates in range d800-dfff. Surrogates are
4270 not valid UTF-8 so they are rejected.
4271 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4272 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
4273 if ((s[1] & 0xc0) != 0x80 ||
4274 (s[2] & 0xc0) != 0x80 ||
4275 ((unsigned char)s[0] == 0xE0 &&
4276 (unsigned char)s[1] < 0xA0) ||
4277 ((unsigned char)s[0] == 0xED &&
4278 (unsigned char)s[1] > 0x9F)) {
4279
4280 goto surrogateescape;
4281 }
4282 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4283 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004284 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004285 break;
4286
4287 case 4:
4288 if ((s[1] & 0xc0) != 0x80 ||
4289 (s[2] & 0xc0) != 0x80 ||
4290 (s[3] & 0xc0) != 0x80 ||
4291 ((unsigned char)s[0] == 0xF0 &&
4292 (unsigned char)s[1] < 0x90) ||
4293 ((unsigned char)s[0] == 0xF4 &&
4294 (unsigned char)s[1] > 0x8F)) {
4295 goto surrogateescape;
4296 }
4297 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
4298 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4299 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4300
4301#if SIZEOF_WCHAR_T == 4
4302 *p++ = (wchar_t)ch;
4303#else
4304 /* compute and append the two surrogates: */
4305
4306 /* translate from 10000..10FFFF to 0..FFFF */
4307 ch -= 0x10000;
4308
4309 /* high surrogate = top 10 bits added to D800 */
4310 *p++ = (wchar_t)(0xD800 + (ch >> 10));
4311
4312 /* low surrogate = bottom 10 bits added to DC00 */
4313 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
4314#endif
4315 break;
4316 }
4317 s += n;
4318 continue;
4319
4320 surrogateescape:
4321 *p++ = 0xDC00 + ch;
4322 s++;
4323 }
4324 *p = L'\0';
4325 return unicode;
4326}
4327
4328#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004329
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004330/* Primary internal function which creates utf8 encoded bytes objects.
4331
4332 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004333 and allocate exactly as much space needed at the end. Else allocate the
4334 maximum possible needed (4 result bytes per Unicode character), and return
4335 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004336*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004337PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004338_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004339{
Tim Peters602f7402002-04-27 18:03:26 +00004340#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004341
Guido van Rossum98297ee2007-11-06 21:34:58 +00004342 Py_ssize_t i; /* index into s of next input byte */
4343 PyObject *result; /* result string object */
4344 char *p; /* next free byte in output buffer */
4345 Py_ssize_t nallocated; /* number of result bytes allocated */
4346 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004347 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004348 PyObject *errorHandler = NULL;
4349 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004350 int kind;
4351 void *data;
4352 Py_ssize_t size;
4353 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
4354#if SIZEOF_WCHAR_T == 2
4355 Py_ssize_t wchar_offset = 0;
4356#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004357
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004358 if (!PyUnicode_Check(unicode)) {
4359 PyErr_BadArgument();
4360 return NULL;
4361 }
4362
4363 if (PyUnicode_READY(unicode) == -1)
4364 return NULL;
4365
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004366 if (PyUnicode_UTF8(unicode))
4367 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4368 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004369
4370 kind = PyUnicode_KIND(unicode);
4371 data = PyUnicode_DATA(unicode);
4372 size = PyUnicode_GET_LENGTH(unicode);
4373
Tim Peters602f7402002-04-27 18:03:26 +00004374 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004375
Tim Peters602f7402002-04-27 18:03:26 +00004376 if (size <= MAX_SHORT_UNICHARS) {
4377 /* Write into the stack buffer; nallocated can't overflow.
4378 * At the end, we'll allocate exactly as much heap space as it
4379 * turns out we need.
4380 */
4381 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004382 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004383 p = stackbuf;
4384 }
4385 else {
4386 /* Overallocate on the heap, and give the excess back at the end. */
4387 nallocated = size * 4;
4388 if (nallocated / 4 != size) /* overflow! */
4389 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004390 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004391 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004392 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004393 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004394 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004395
Tim Peters602f7402002-04-27 18:03:26 +00004396 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004397 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004398
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004399 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004400 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004401 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004402
Guido van Rossumd57fd912000-03-10 22:53:23 +00004403 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004404 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004405 *p++ = (char)(0xc0 | (ch >> 6));
4406 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004407 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004408 Py_ssize_t newpos;
4409 PyObject *rep;
4410 Py_ssize_t repsize, k, startpos;
4411 startpos = i-1;
4412#if SIZEOF_WCHAR_T == 2
4413 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00004414#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004415 rep = unicode_encode_call_errorhandler(
4416 errors, &errorHandler, "utf-8", "surrogates not allowed",
4417 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4418 &exc, startpos, startpos+1, &newpos);
4419 if (!rep)
4420 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004421
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004422 if (PyBytes_Check(rep))
4423 repsize = PyBytes_GET_SIZE(rep);
4424 else
4425 repsize = PyUnicode_GET_SIZE(rep);
4426
4427 if (repsize > 4) {
4428 Py_ssize_t offset;
4429
4430 if (result == NULL)
4431 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004432 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004433 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004434
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004435 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4436 /* integer overflow */
4437 PyErr_NoMemory();
4438 goto error;
4439 }
4440 nallocated += repsize - 4;
4441 if (result != NULL) {
4442 if (_PyBytes_Resize(&result, nallocated) < 0)
4443 goto error;
4444 } else {
4445 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004446 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004447 goto error;
4448 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4449 }
4450 p = PyBytes_AS_STRING(result) + offset;
4451 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004452
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004453 if (PyBytes_Check(rep)) {
4454 char *prep = PyBytes_AS_STRING(rep);
4455 for(k = repsize; k > 0; k--)
4456 *p++ = *prep++;
4457 } else /* rep is unicode */ {
4458 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4459 Py_UNICODE c;
4460
4461 for(k=0; k<repsize; k++) {
4462 c = prep[k];
4463 if (0x80 <= c) {
4464 raise_encode_exception(&exc, "utf-8",
4465 PyUnicode_AS_UNICODE(unicode),
4466 size, i-1, i,
4467 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004468 goto error;
4469 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004470 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004471 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004472 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004473 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004474 } else if (ch < 0x10000) {
4475 *p++ = (char)(0xe0 | (ch >> 12));
4476 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4477 *p++ = (char)(0x80 | (ch & 0x3f));
4478 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004479 /* Encode UCS4 Unicode ordinals */
4480 *p++ = (char)(0xf0 | (ch >> 18));
4481 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4482 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4483 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004484#if SIZEOF_WCHAR_T == 2
4485 wchar_offset++;
4486#endif
Tim Peters602f7402002-04-27 18:03:26 +00004487 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004488 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004489
Guido van Rossum98297ee2007-11-06 21:34:58 +00004490 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004491 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004492 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004493 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004494 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004495 }
4496 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004497 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004498 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004499 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004500 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004501 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004502
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004503 Py_XDECREF(errorHandler);
4504 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004505 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004506 error:
4507 Py_XDECREF(errorHandler);
4508 Py_XDECREF(exc);
4509 Py_XDECREF(result);
4510 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004511
Tim Peters602f7402002-04-27 18:03:26 +00004512#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004513}
4514
Alexander Belopolsky40018472011-02-26 01:02:56 +00004515PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004516PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4517 Py_ssize_t size,
4518 const char *errors)
4519{
4520 PyObject *v, *unicode;
4521
4522 unicode = PyUnicode_FromUnicode(s, size);
4523 if (unicode == NULL)
4524 return NULL;
4525 v = _PyUnicode_AsUTF8String(unicode, errors);
4526 Py_DECREF(unicode);
4527 return v;
4528}
4529
4530PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004531PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004532{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004533 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004534}
4535
Walter Dörwald41980ca2007-08-16 21:55:45 +00004536/* --- UTF-32 Codec ------------------------------------------------------- */
4537
4538PyObject *
4539PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004540 Py_ssize_t size,
4541 const char *errors,
4542 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004543{
4544 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4545}
4546
4547PyObject *
4548PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004549 Py_ssize_t size,
4550 const char *errors,
4551 int *byteorder,
4552 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004553{
4554 const char *starts = s;
4555 Py_ssize_t startinpos;
4556 Py_ssize_t endinpos;
4557 Py_ssize_t outpos;
4558 PyUnicodeObject *unicode;
4559 Py_UNICODE *p;
4560#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004561 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004562 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004563#else
4564 const int pairs = 0;
4565#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004566 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004567 int bo = 0; /* assume native ordering by default */
4568 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004569 /* Offsets from q for retrieving bytes in the right order. */
4570#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4571 int iorder[] = {0, 1, 2, 3};
4572#else
4573 int iorder[] = {3, 2, 1, 0};
4574#endif
4575 PyObject *errorHandler = NULL;
4576 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004577
Walter Dörwald41980ca2007-08-16 21:55:45 +00004578 q = (unsigned char *)s;
4579 e = q + size;
4580
4581 if (byteorder)
4582 bo = *byteorder;
4583
4584 /* Check for BOM marks (U+FEFF) in the input and adjust current
4585 byte order setting accordingly. In native mode, the leading BOM
4586 mark is skipped, in all other modes, it is copied to the output
4587 stream as-is (giving a ZWNBSP character). */
4588 if (bo == 0) {
4589 if (size >= 4) {
4590 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004591 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004592#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004593 if (bom == 0x0000FEFF) {
4594 q += 4;
4595 bo = -1;
4596 }
4597 else if (bom == 0xFFFE0000) {
4598 q += 4;
4599 bo = 1;
4600 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004601#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004602 if (bom == 0x0000FEFF) {
4603 q += 4;
4604 bo = 1;
4605 }
4606 else if (bom == 0xFFFE0000) {
4607 q += 4;
4608 bo = -1;
4609 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004610#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004611 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004612 }
4613
4614 if (bo == -1) {
4615 /* force LE */
4616 iorder[0] = 0;
4617 iorder[1] = 1;
4618 iorder[2] = 2;
4619 iorder[3] = 3;
4620 }
4621 else if (bo == 1) {
4622 /* force BE */
4623 iorder[0] = 3;
4624 iorder[1] = 2;
4625 iorder[2] = 1;
4626 iorder[3] = 0;
4627 }
4628
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004629 /* On narrow builds we split characters outside the BMP into two
4630 codepoints => count how much extra space we need. */
4631#ifndef Py_UNICODE_WIDE
4632 for (qq = q; qq < e; qq += 4)
4633 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4634 pairs++;
4635#endif
4636
4637 /* This might be one to much, because of a BOM */
4638 unicode = _PyUnicode_New((size+3)/4+pairs);
4639 if (!unicode)
4640 return NULL;
4641 if (size == 0)
4642 return (PyObject *)unicode;
4643
4644 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004645 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004646
Walter Dörwald41980ca2007-08-16 21:55:45 +00004647 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004648 Py_UCS4 ch;
4649 /* remaining bytes at the end? (size should be divisible by 4) */
4650 if (e-q<4) {
4651 if (consumed)
4652 break;
4653 errmsg = "truncated data";
4654 startinpos = ((const char *)q)-starts;
4655 endinpos = ((const char *)e)-starts;
4656 goto utf32Error;
4657 /* The remaining input chars are ignored if the callback
4658 chooses to skip the input */
4659 }
4660 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4661 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004662
Benjamin Peterson29060642009-01-31 22:14:21 +00004663 if (ch >= 0x110000)
4664 {
4665 errmsg = "codepoint not in range(0x110000)";
4666 startinpos = ((const char *)q)-starts;
4667 endinpos = startinpos+4;
4668 goto utf32Error;
4669 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004670#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004671 if (ch >= 0x10000)
4672 {
4673 *p++ = 0xD800 | ((ch-0x10000) >> 10);
4674 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4675 }
4676 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00004677#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004678 *p++ = ch;
4679 q += 4;
4680 continue;
4681 utf32Error:
4682 outpos = p-PyUnicode_AS_UNICODE(unicode);
4683 if (unicode_decode_call_errorhandler(
4684 errors, &errorHandler,
4685 "utf32", errmsg,
4686 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4687 &unicode, &outpos, &p))
4688 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004689 }
4690
4691 if (byteorder)
4692 *byteorder = bo;
4693
4694 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004695 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004696
4697 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02004698 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004699 goto onError;
4700
4701 Py_XDECREF(errorHandler);
4702 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004703 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004704 Py_DECREF(unicode);
4705 return NULL;
4706 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004707 return (PyObject *)unicode;
4708
Benjamin Peterson29060642009-01-31 22:14:21 +00004709 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00004710 Py_DECREF(unicode);
4711 Py_XDECREF(errorHandler);
4712 Py_XDECREF(exc);
4713 return NULL;
4714}
4715
4716PyObject *
4717PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004718 Py_ssize_t size,
4719 const char *errors,
4720 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004721{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004722 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004723 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004724 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004725#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004726 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004727#else
4728 const int pairs = 0;
4729#endif
4730 /* Offsets from p for storing byte pairs in the right order. */
4731#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4732 int iorder[] = {0, 1, 2, 3};
4733#else
4734 int iorder[] = {3, 2, 1, 0};
4735#endif
4736
Benjamin Peterson29060642009-01-31 22:14:21 +00004737#define STORECHAR(CH) \
4738 do { \
4739 p[iorder[3]] = ((CH) >> 24) & 0xff; \
4740 p[iorder[2]] = ((CH) >> 16) & 0xff; \
4741 p[iorder[1]] = ((CH) >> 8) & 0xff; \
4742 p[iorder[0]] = (CH) & 0xff; \
4743 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00004744 } while(0)
4745
4746 /* In narrow builds we can output surrogate pairs as one codepoint,
4747 so we need less space. */
4748#ifndef Py_UNICODE_WIDE
4749 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004750 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
4751 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
4752 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004753#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004754 nsize = (size - pairs + (byteorder == 0));
4755 bytesize = nsize * 4;
4756 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004757 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004758 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004759 if (v == NULL)
4760 return NULL;
4761
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004762 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004763 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004764 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004765 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004766 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004767
4768 if (byteorder == -1) {
4769 /* force LE */
4770 iorder[0] = 0;
4771 iorder[1] = 1;
4772 iorder[2] = 2;
4773 iorder[3] = 3;
4774 }
4775 else if (byteorder == 1) {
4776 /* force BE */
4777 iorder[0] = 3;
4778 iorder[1] = 2;
4779 iorder[2] = 1;
4780 iorder[3] = 0;
4781 }
4782
4783 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004784 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004785#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004786 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
4787 Py_UCS4 ch2 = *s;
4788 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
4789 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
4790 s++;
4791 size--;
4792 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004793 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004794#endif
4795 STORECHAR(ch);
4796 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004797
4798 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004799 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004800#undef STORECHAR
4801}
4802
Alexander Belopolsky40018472011-02-26 01:02:56 +00004803PyObject *
4804PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004805{
4806 if (!PyUnicode_Check(unicode)) {
4807 PyErr_BadArgument();
4808 return NULL;
4809 }
4810 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004811 PyUnicode_GET_SIZE(unicode),
4812 NULL,
4813 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004814}
4815
Guido van Rossumd57fd912000-03-10 22:53:23 +00004816/* --- UTF-16 Codec ------------------------------------------------------- */
4817
Tim Peters772747b2001-08-09 22:21:55 +00004818PyObject *
4819PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004820 Py_ssize_t size,
4821 const char *errors,
4822 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004823{
Walter Dörwald69652032004-09-07 20:24:22 +00004824 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
4825}
4826
Antoine Pitrouab868312009-01-10 15:40:25 +00004827/* Two masks for fast checking of whether a C 'long' may contain
4828 UTF16-encoded surrogate characters. This is an efficient heuristic,
4829 assuming that non-surrogate characters with a code point >= 0x8000 are
4830 rare in most input.
4831 FAST_CHAR_MASK is used when the input is in native byte ordering,
4832 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00004833*/
Antoine Pitrouab868312009-01-10 15:40:25 +00004834#if (SIZEOF_LONG == 8)
4835# define FAST_CHAR_MASK 0x8000800080008000L
4836# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
4837#elif (SIZEOF_LONG == 4)
4838# define FAST_CHAR_MASK 0x80008000L
4839# define SWAPPED_FAST_CHAR_MASK 0x00800080L
4840#else
4841# error C 'long' size should be either 4 or 8!
4842#endif
4843
Walter Dörwald69652032004-09-07 20:24:22 +00004844PyObject *
4845PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004846 Py_ssize_t size,
4847 const char *errors,
4848 int *byteorder,
4849 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004850{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004851 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004852 Py_ssize_t startinpos;
4853 Py_ssize_t endinpos;
4854 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004855 PyUnicodeObject *unicode;
4856 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004857 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00004858 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00004859 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004860 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00004861 /* Offsets from q for retrieving byte pairs in the right order. */
4862#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4863 int ihi = 1, ilo = 0;
4864#else
4865 int ihi = 0, ilo = 1;
4866#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004867 PyObject *errorHandler = NULL;
4868 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004869
4870 /* Note: size will always be longer than the resulting Unicode
4871 character count */
4872 unicode = _PyUnicode_New(size);
4873 if (!unicode)
4874 return NULL;
4875 if (size == 0)
4876 return (PyObject *)unicode;
4877
4878 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004879 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00004880 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00004881 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004882
4883 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00004884 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004885
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004886 /* Check for BOM marks (U+FEFF) in the input and adjust current
4887 byte order setting accordingly. In native mode, the leading BOM
4888 mark is skipped, in all other modes, it is copied to the output
4889 stream as-is (giving a ZWNBSP character). */
4890 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00004891 if (size >= 2) {
4892 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004893#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004894 if (bom == 0xFEFF) {
4895 q += 2;
4896 bo = -1;
4897 }
4898 else if (bom == 0xFFFE) {
4899 q += 2;
4900 bo = 1;
4901 }
Tim Petersced69f82003-09-16 20:30:58 +00004902#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004903 if (bom == 0xFEFF) {
4904 q += 2;
4905 bo = 1;
4906 }
4907 else if (bom == 0xFFFE) {
4908 q += 2;
4909 bo = -1;
4910 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004911#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004912 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004913 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004914
Tim Peters772747b2001-08-09 22:21:55 +00004915 if (bo == -1) {
4916 /* force LE */
4917 ihi = 1;
4918 ilo = 0;
4919 }
4920 else if (bo == 1) {
4921 /* force BE */
4922 ihi = 0;
4923 ilo = 1;
4924 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004925#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4926 native_ordering = ilo < ihi;
4927#else
4928 native_ordering = ilo > ihi;
4929#endif
Tim Peters772747b2001-08-09 22:21:55 +00004930
Antoine Pitrouab868312009-01-10 15:40:25 +00004931 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00004932 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004933 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00004934 /* First check for possible aligned read of a C 'long'. Unaligned
4935 reads are more expensive, better to defer to another iteration. */
4936 if (!((size_t) q & LONG_PTR_MASK)) {
4937 /* Fast path for runs of non-surrogate chars. */
4938 register const unsigned char *_q = q;
4939 Py_UNICODE *_p = p;
4940 if (native_ordering) {
4941 /* Native ordering is simple: as long as the input cannot
4942 possibly contain a surrogate char, do an unrolled copy
4943 of several 16-bit code points to the target object.
4944 The non-surrogate check is done on several input bytes
4945 at a time (as many as a C 'long' can contain). */
4946 while (_q < aligned_end) {
4947 unsigned long data = * (unsigned long *) _q;
4948 if (data & FAST_CHAR_MASK)
4949 break;
4950 _p[0] = ((unsigned short *) _q)[0];
4951 _p[1] = ((unsigned short *) _q)[1];
4952#if (SIZEOF_LONG == 8)
4953 _p[2] = ((unsigned short *) _q)[2];
4954 _p[3] = ((unsigned short *) _q)[3];
4955#endif
4956 _q += SIZEOF_LONG;
4957 _p += SIZEOF_LONG / 2;
4958 }
4959 }
4960 else {
4961 /* Byteswapped ordering is similar, but we must decompose
4962 the copy bytewise, and take care of zero'ing out the
4963 upper bytes if the target object is in 32-bit units
4964 (that is, in UCS-4 builds). */
4965 while (_q < aligned_end) {
4966 unsigned long data = * (unsigned long *) _q;
4967 if (data & SWAPPED_FAST_CHAR_MASK)
4968 break;
4969 /* Zero upper bytes in UCS-4 builds */
4970#if (Py_UNICODE_SIZE > 2)
4971 _p[0] = 0;
4972 _p[1] = 0;
4973#if (SIZEOF_LONG == 8)
4974 _p[2] = 0;
4975 _p[3] = 0;
4976#endif
4977#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004978 /* Issue #4916; UCS-4 builds on big endian machines must
4979 fill the two last bytes of each 4-byte unit. */
4980#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
4981# define OFF 2
4982#else
4983# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00004984#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004985 ((unsigned char *) _p)[OFF + 1] = _q[0];
4986 ((unsigned char *) _p)[OFF + 0] = _q[1];
4987 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
4988 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
4989#if (SIZEOF_LONG == 8)
4990 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
4991 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
4992 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
4993 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
4994#endif
4995#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00004996 _q += SIZEOF_LONG;
4997 _p += SIZEOF_LONG / 2;
4998 }
4999 }
5000 p = _p;
5001 q = _q;
5002 if (q >= e)
5003 break;
5004 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005005 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005006
Benjamin Peterson14339b62009-01-31 16:36:08 +00005007 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005008
5009 if (ch < 0xD800 || ch > 0xDFFF) {
5010 *p++ = ch;
5011 continue;
5012 }
5013
5014 /* UTF-16 code pair: */
5015 if (q > e) {
5016 errmsg = "unexpected end of data";
5017 startinpos = (((const char *)q) - 2) - starts;
5018 endinpos = ((const char *)e) + 1 - starts;
5019 goto utf16Error;
5020 }
5021 if (0xD800 <= ch && ch <= 0xDBFF) {
5022 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
5023 q += 2;
5024 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00005025#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005026 *p++ = ch;
5027 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005028#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005029 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005030#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005031 continue;
5032 }
5033 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005034 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005035 startinpos = (((const char *)q)-4)-starts;
5036 endinpos = startinpos+2;
5037 goto utf16Error;
5038 }
5039
Benjamin Peterson14339b62009-01-31 16:36:08 +00005040 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005041 errmsg = "illegal encoding";
5042 startinpos = (((const char *)q)-2)-starts;
5043 endinpos = startinpos+2;
5044 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005045
Benjamin Peterson29060642009-01-31 22:14:21 +00005046 utf16Error:
5047 outpos = p - PyUnicode_AS_UNICODE(unicode);
5048 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005049 errors,
5050 &errorHandler,
5051 "utf16", errmsg,
5052 &starts,
5053 (const char **)&e,
5054 &startinpos,
5055 &endinpos,
5056 &exc,
5057 (const char **)&q,
5058 &unicode,
5059 &outpos,
5060 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00005061 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005062 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005063 /* remaining byte at the end? (size should be even) */
5064 if (e == q) {
5065 if (!consumed) {
5066 errmsg = "truncated data";
5067 startinpos = ((const char *)q) - starts;
5068 endinpos = ((const char *)e) + 1 - starts;
5069 outpos = p - PyUnicode_AS_UNICODE(unicode);
5070 if (unicode_decode_call_errorhandler(
5071 errors,
5072 &errorHandler,
5073 "utf16", errmsg,
5074 &starts,
5075 (const char **)&e,
5076 &startinpos,
5077 &endinpos,
5078 &exc,
5079 (const char **)&q,
5080 &unicode,
5081 &outpos,
5082 &p))
5083 goto onError;
5084 /* The remaining input chars are ignored if the callback
5085 chooses to skip the input */
5086 }
5087 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005088
5089 if (byteorder)
5090 *byteorder = bo;
5091
Walter Dörwald69652032004-09-07 20:24:22 +00005092 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005093 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005094
Guido van Rossumd57fd912000-03-10 22:53:23 +00005095 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02005096 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005097 goto onError;
5098
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005099 Py_XDECREF(errorHandler);
5100 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005101 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005102 Py_DECREF(unicode);
5103 return NULL;
5104 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005105 return (PyObject *)unicode;
5106
Benjamin Peterson29060642009-01-31 22:14:21 +00005107 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005108 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005109 Py_XDECREF(errorHandler);
5110 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005111 return NULL;
5112}
5113
Antoine Pitrouab868312009-01-10 15:40:25 +00005114#undef FAST_CHAR_MASK
5115#undef SWAPPED_FAST_CHAR_MASK
5116
Tim Peters772747b2001-08-09 22:21:55 +00005117PyObject *
5118PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005119 Py_ssize_t size,
5120 const char *errors,
5121 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005122{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005123 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005124 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005125 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005126#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005127 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005128#else
5129 const int pairs = 0;
5130#endif
Tim Peters772747b2001-08-09 22:21:55 +00005131 /* Offsets from p for storing byte pairs in the right order. */
5132#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5133 int ihi = 1, ilo = 0;
5134#else
5135 int ihi = 0, ilo = 1;
5136#endif
5137
Benjamin Peterson29060642009-01-31 22:14:21 +00005138#define STORECHAR(CH) \
5139 do { \
5140 p[ihi] = ((CH) >> 8) & 0xff; \
5141 p[ilo] = (CH) & 0xff; \
5142 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005143 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005144
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005145#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005146 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00005147 if (s[i] >= 0x10000)
5148 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005149#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005150 /* 2 * (size + pairs + (byteorder == 0)) */
5151 if (size > PY_SSIZE_T_MAX ||
5152 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005153 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005154 nsize = size + pairs + (byteorder == 0);
5155 bytesize = nsize * 2;
5156 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005157 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005158 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005159 if (v == NULL)
5160 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005161
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005162 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005163 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005164 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005165 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005166 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005167
5168 if (byteorder == -1) {
5169 /* force LE */
5170 ihi = 1;
5171 ilo = 0;
5172 }
5173 else if (byteorder == 1) {
5174 /* force BE */
5175 ihi = 0;
5176 ilo = 1;
5177 }
5178
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005179 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005180 Py_UNICODE ch = *s++;
5181 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005182#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005183 if (ch >= 0x10000) {
5184 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5185 ch = 0xD800 | ((ch-0x10000) >> 10);
5186 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005187#endif
Tim Peters772747b2001-08-09 22:21:55 +00005188 STORECHAR(ch);
5189 if (ch2)
5190 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005191 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005192
5193 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005194 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005195#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005196}
5197
Alexander Belopolsky40018472011-02-26 01:02:56 +00005198PyObject *
5199PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005200{
5201 if (!PyUnicode_Check(unicode)) {
5202 PyErr_BadArgument();
5203 return NULL;
5204 }
5205 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005206 PyUnicode_GET_SIZE(unicode),
5207 NULL,
5208 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005209}
5210
5211/* --- Unicode Escape Codec ----------------------------------------------- */
5212
/* Helper for PyUnicode_DecodeUnicodeEscape: decide whether the escaped
   input decodes to pure ASCII and, if so, how long the result is.

   Returns the number of characters the decoded string needs, or -1 when
   - size is negative,
   - the input contains a byte > 127,
   - a backslash is the last byte of the input, or
   - an escape is used that may produce a non-ASCII character
     (octal digits, \x, \u, \U, \N).
 */
Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *bytes = (const unsigned char *)s;
    Py_ssize_t pos = 0;
    Py_ssize_t count = 0;

    if (size < 0)
        return -1;

    while (pos < size) {
        unsigned char c = bytes[pos++];

        /* Non-ASCII byte */
        if (c > 127)
            return -1;

        /* Ordinary character: contributes exactly one output char */
        if (c != '\\') {
            count++;
            continue;
        }

        /* Backslash: there must be an ASCII follow-up character */
        if (pos >= size)
            return -1;
        c = bytes[pos++];
        if (c > 127)
            return -1;

        switch (c) {
        case '\n':
            /* backslash + newline produces nothing */
            break;
        case '\\': case '\'': case '\"':
        case 'a': case 'b': case 'f':
        case 'n': case 'r': case 't': case 'v':
            /* simple escape: one output character */
            count += 1;
            break;
        case '0': case '1': case '2': case '3':
        case '4': case '5': case '6': case '7':
        case 'x': case 'u': case 'U': case 'N':
            /* numeric/named escapes may leave the ASCII range */
            return -1;
        default:
            /* unknown escape: backslash and character are both kept */
            count += 2;
            break;
        }
    }
    return count;
}
5267
/* Store `value` at position `index` of the output buffer `buf`.

   Similar to PyUnicode_WRITE, but only two layouts are distinguished:
   when `kind` is PyUnicode_WCHAR_KIND the buffer is written as an array
   of Py_UNICODE (the wstr field); for any other kind it is written as an
   array of bytes (the ASCII case).  Only one branch runs, so `index` and
   `value` are each evaluated once and side effects such as `i++` are
   safe. */
#define WRITE_ASCII_OR_WSTR(kind, buf, index, value)                    \
    do {                                                                \
        if ((kind) != PyUnicode_WCHAR_KIND)                             \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
        else                                                            \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);       \
    } while (0)

/* Store `value` at position `index` of `buf`, which must be a Py_UNICODE
   array.

   NOTE: this macro reads a variable named `kind` from the scope in which
   it is expanded and asserts that it equals PyUnicode_WCHAR_KIND.  It is
   a statement (do/while(0)), not an expression, so it cannot be
   mis-parsed next to other operators and is safe as the body of an
   unbraced if/else. */
#define WRITE_WSTR(buf, index, value)                                   \
    do {                                                                \
        assert(kind == PyUnicode_WCHAR_KIND);                           \
        ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);           \
    } while (0)
5281
5282
Fredrik Lundh06d12682001-01-24 07:59:11 +00005283static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005284
Alexander Belopolsky40018472011-02-26 01:02:56 +00005285PyObject *
5286PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005287 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005288 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005289{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005290 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005291 Py_ssize_t startinpos;
5292 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005293 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005294 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005295 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005296 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005297 char* message;
5298 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005299 PyObject *errorHandler = NULL;
5300 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005301 Py_ssize_t ascii_length;
5302 Py_ssize_t i;
5303 int kind;
5304 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005305
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005306 ascii_length = length_of_escaped_ascii_string(s, size);
5307
5308 /* After length_of_escaped_ascii_string() there are two alternatives,
5309 either the string is pure ASCII with named escapes like \n, etc.
5310 and we determined it's exact size (common case)
5311 or it contains \x, \u, ... escape sequences. then we create a
5312 legacy wchar string and resize it at the end of this function. */
5313 if (ascii_length >= 0) {
5314 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
5315 if (!v)
5316 goto onError;
5317 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5318 kind = PyUnicode_1BYTE_KIND;
5319 data = PyUnicode_DATA(v);
5320 }
5321 else {
5322 /* Escaped strings will always be longer than the resulting
5323 Unicode string, so we start with size here and then reduce the
5324 length after conversion to the true value.
5325 (but if the error callback returns a long replacement string
5326 we'll have to allocate more space) */
5327 v = _PyUnicode_New(size);
5328 if (!v)
5329 goto onError;
5330 kind = PyUnicode_WCHAR_KIND;
5331 data = PyUnicode_AS_UNICODE(v);
5332 }
5333
Guido van Rossumd57fd912000-03-10 22:53:23 +00005334 if (size == 0)
5335 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005336 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005337 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005338
Guido van Rossumd57fd912000-03-10 22:53:23 +00005339 while (s < end) {
5340 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005341 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005342 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005343
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005344 if (kind == PyUnicode_WCHAR_KIND) {
5345 assert(i < _PyUnicode_WSTR_LENGTH(v));
5346 }
5347 else {
5348 /* The only case in which i == ascii_length is a backslash
5349 followed by a newline. */
5350 assert(i <= ascii_length);
5351 }
5352
Guido van Rossumd57fd912000-03-10 22:53:23 +00005353 /* Non-escape characters are interpreted as Unicode ordinals */
5354 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005355 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005356 continue;
5357 }
5358
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005359 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005360 /* \ - Escapes */
5361 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005362 c = *s++;
5363 if (s > end)
5364 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005365
5366 if (kind == PyUnicode_WCHAR_KIND) {
5367 assert(i < _PyUnicode_WSTR_LENGTH(v));
5368 }
5369 else {
5370 /* The only case in which i == ascii_length is a backslash
5371 followed by a newline. */
5372 assert(i < ascii_length || (i == ascii_length && c == '\n'));
5373 }
5374
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005375 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005376
Benjamin Peterson29060642009-01-31 22:14:21 +00005377 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005378 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005379 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5380 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5381 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5382 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5383 /* FF */
5384 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5385 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5386 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5387 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5388 /* VT */
5389 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5390 /* BEL, not classic C */
5391 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005392
Benjamin Peterson29060642009-01-31 22:14:21 +00005393 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005394 case '0': case '1': case '2': case '3':
5395 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005396 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005397 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005398 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005399 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005400 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005401 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005402 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005403 break;
5404
Benjamin Peterson29060642009-01-31 22:14:21 +00005405 /* hex escapes */
5406 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005407 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005408 digits = 2;
5409 message = "truncated \\xXX escape";
5410 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005411
Benjamin Peterson29060642009-01-31 22:14:21 +00005412 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005413 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005414 digits = 4;
5415 message = "truncated \\uXXXX escape";
5416 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005417
Benjamin Peterson29060642009-01-31 22:14:21 +00005418 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005419 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005420 digits = 8;
5421 message = "truncated \\UXXXXXXXX escape";
5422 hexescape:
5423 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005424 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005425 if (s+digits>end) {
5426 endinpos = size;
5427 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005428 errors, &errorHandler,
5429 "unicodeescape", "end of string in escape sequence",
5430 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005431 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005432 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005433 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005434 goto nextByte;
5435 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005436 for (j = 0; j < digits; ++j) {
5437 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005438 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005439 endinpos = (s+j+1)-starts;
5440 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005441 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005442 errors, &errorHandler,
5443 "unicodeescape", message,
5444 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005445 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005446 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005447 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005448 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005449 }
5450 chr = (chr<<4) & ~0xF;
5451 if (c >= '0' && c <= '9')
5452 chr += c - '0';
5453 else if (c >= 'a' && c <= 'f')
5454 chr += 10 + c - 'a';
5455 else
5456 chr += 10 + c - 'A';
5457 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005458 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005459 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005460 /* _decoding_error will have already written into the
5461 target buffer. */
5462 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005463 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005464 /* when we get here, chr is a 32-bit unicode character */
5465 if (chr <= 0xffff)
5466 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005467 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005468 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005469 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005470 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005471#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005472 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005473#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005474 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005475 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5476 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005477#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005478 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005479 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005480 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005481 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005482 errors, &errorHandler,
5483 "unicodeescape", "illegal Unicode character",
5484 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005485 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005486 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005487 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005488 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005489 break;
5490
Benjamin Peterson29060642009-01-31 22:14:21 +00005491 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005492 case 'N':
5493 message = "malformed \\N character escape";
5494 if (ucnhash_CAPI == NULL) {
5495 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005496 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5497 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005498 if (ucnhash_CAPI == NULL)
5499 goto ucnhashError;
5500 }
5501 if (*s == '{') {
5502 const char *start = s+1;
5503 /* look for the closing brace */
5504 while (*s != '}' && s < end)
5505 s++;
5506 if (s > start && s < end && *s == '}') {
5507 /* found a name. look it up in the unicode database */
5508 message = "unknown Unicode character name";
5509 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005510 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5511 &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005512 goto store;
5513 }
5514 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005515 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005516 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005517 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005518 errors, &errorHandler,
5519 "unicodeescape", message,
5520 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005521 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005522 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005523 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005524 break;
5525
5526 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005527 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005528 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005529 message = "\\ at end of string";
5530 s--;
5531 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005532 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005533 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005534 errors, &errorHandler,
5535 "unicodeescape", message,
5536 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005537 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005538 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005539 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005540 }
5541 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005542 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5543 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005544 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005545 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005546 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005547 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005548 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005549 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005550 /* Ensure the length prediction worked in case of ASCII strings */
5551 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5552
Victor Stinnerfe226c02011-10-03 03:52:20 +02005553 if (kind == PyUnicode_WCHAR_KIND)
5554 {
5555 if (PyUnicode_Resize((PyObject**)&v, i) < 0)
5556 goto onError;
Victor Stinnerfe226c02011-10-03 03:52:20 +02005557 }
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005558 Py_XDECREF(errorHandler);
5559 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005560 if (_PyUnicode_READY_REPLACE(&v)) {
5561 Py_DECREF(v);
5562 return NULL;
5563 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005564 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005565
Benjamin Peterson29060642009-01-31 22:14:21 +00005566 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005567 PyErr_SetString(
5568 PyExc_UnicodeError,
5569 "\\N escapes not supported (can't load unicodedata module)"
5570 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005571 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005572 Py_XDECREF(errorHandler);
5573 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005574 return NULL;
5575
Benjamin Peterson29060642009-01-31 22:14:21 +00005576 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005577 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005578 Py_XDECREF(errorHandler);
5579 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005580 return NULL;
5581}
5582
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005583#undef WRITE_ASCII_OR_WSTR
5584#undef WRITE_WSTR
5585
Guido van Rossumd57fd912000-03-10 22:53:23 +00005586/* Return a Unicode-Escape string version of the Unicode object.
5587
5588 If quotes is true, the string is enclosed in u"" or u'' quotes as
5589 appropriate.
5590
5591*/
5592
Walter Dörwald79e913e2007-05-12 11:08:06 +00005593static const char *hexdigits = "0123456789abcdef";
5594
Alexander Belopolsky40018472011-02-26 01:02:56 +00005595PyObject *
5596PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005597 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005598{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005599 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005600 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005601
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005602#ifdef Py_UNICODE_WIDE
5603 const Py_ssize_t expandsize = 10;
5604#else
5605 const Py_ssize_t expandsize = 6;
5606#endif
5607
Thomas Wouters89f507f2006-12-13 04:49:30 +00005608 /* XXX(nnorwitz): rather than over-allocating, it would be
5609 better to choose a different scheme. Perhaps scan the
5610 first N-chars of the string and allocate based on that size.
5611 */
5612 /* Initial allocation is based on the longest-possible unichr
5613 escape.
5614
5615 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5616 unichr, so in this case it's the longest unichr escape. In
5617 narrow (UTF-16) builds this is five chars per source unichr
5618 since there are two unichrs in the surrogate pair, so in narrow
5619 (UTF-16) builds it's not the longest unichr escape.
5620
5621 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5622 so in the narrow (UTF-16) build case it's the longest unichr
5623 escape.
5624 */
5625
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005626 if (size == 0)
5627 return PyBytes_FromStringAndSize(NULL, 0);
5628
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005629 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005630 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005631
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005632 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005633 2
5634 + expandsize*size
5635 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005636 if (repr == NULL)
5637 return NULL;
5638
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005639 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005640
Guido van Rossumd57fd912000-03-10 22:53:23 +00005641 while (size-- > 0) {
5642 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005643
Walter Dörwald79e913e2007-05-12 11:08:06 +00005644 /* Escape backslashes */
5645 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005646 *p++ = '\\';
5647 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005648 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005649 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005650
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00005651#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005652 /* Map 21-bit characters to '\U00xxxxxx' */
5653 else if (ch >= 0x10000) {
5654 *p++ = '\\';
5655 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005656 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5657 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5658 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5659 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5660 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5661 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5662 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5663 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005664 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005665 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005666#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005667 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5668 else if (ch >= 0xD800 && ch < 0xDC00) {
5669 Py_UNICODE ch2;
5670 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00005671
Benjamin Peterson29060642009-01-31 22:14:21 +00005672 ch2 = *s++;
5673 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005674 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005675 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5676 *p++ = '\\';
5677 *p++ = 'U';
5678 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
5679 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
5680 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
5681 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
5682 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
5683 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
5684 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
5685 *p++ = hexdigits[ucs & 0x0000000F];
5686 continue;
5687 }
5688 /* Fall through: isolated surrogates are copied as-is */
5689 s--;
5690 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005691 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005692#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005693
Guido van Rossumd57fd912000-03-10 22:53:23 +00005694 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005695 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005696 *p++ = '\\';
5697 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005698 *p++ = hexdigits[(ch >> 12) & 0x000F];
5699 *p++ = hexdigits[(ch >> 8) & 0x000F];
5700 *p++ = hexdigits[(ch >> 4) & 0x000F];
5701 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005702 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005703
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005704 /* Map special whitespace to '\t', \n', '\r' */
5705 else if (ch == '\t') {
5706 *p++ = '\\';
5707 *p++ = 't';
5708 }
5709 else if (ch == '\n') {
5710 *p++ = '\\';
5711 *p++ = 'n';
5712 }
5713 else if (ch == '\r') {
5714 *p++ = '\\';
5715 *p++ = 'r';
5716 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005717
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005718 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005719 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005720 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005721 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005722 *p++ = hexdigits[(ch >> 4) & 0x000F];
5723 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005724 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005725
Guido van Rossumd57fd912000-03-10 22:53:23 +00005726 /* Copy everything else as-is */
5727 else
5728 *p++ = (char) ch;
5729 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005730
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005731 assert(p - PyBytes_AS_STRING(repr) > 0);
5732 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5733 return NULL;
5734 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005735}
5736
Alexander Belopolsky40018472011-02-26 01:02:56 +00005737PyObject *
5738PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005739{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005740 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005741 if (!PyUnicode_Check(unicode)) {
5742 PyErr_BadArgument();
5743 return NULL;
5744 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00005745 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5746 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005747 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005748}
5749
5750/* --- Raw Unicode Escape Codec ------------------------------------------- */
5751
Alexander Belopolsky40018472011-02-26 01:02:56 +00005752PyObject *
5753PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005754 Py_ssize_t size,
5755 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005756{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005757 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005758 Py_ssize_t startinpos;
5759 Py_ssize_t endinpos;
5760 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005761 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005762 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005763 const char *end;
5764 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005765 PyObject *errorHandler = NULL;
5766 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005767
Guido van Rossumd57fd912000-03-10 22:53:23 +00005768 /* Escaped strings will always be longer than the resulting
5769 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005770 length after conversion to the true value. (But decoding error
5771 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005772 v = _PyUnicode_New(size);
5773 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005774 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005775 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005776 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005777 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005778 end = s + size;
5779 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005780 unsigned char c;
5781 Py_UCS4 x;
5782 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005783 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005784
Benjamin Peterson29060642009-01-31 22:14:21 +00005785 /* Non-escape characters are interpreted as Unicode ordinals */
5786 if (*s != '\\') {
5787 *p++ = (unsigned char)*s++;
5788 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005789 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005790 startinpos = s-starts;
5791
5792 /* \u-escapes are only interpreted iff the number of leading
5793 backslashes if odd */
5794 bs = s;
5795 for (;s < end;) {
5796 if (*s != '\\')
5797 break;
5798 *p++ = (unsigned char)*s++;
5799 }
5800 if (((s - bs) & 1) == 0 ||
5801 s >= end ||
5802 (*s != 'u' && *s != 'U')) {
5803 continue;
5804 }
5805 p--;
5806 count = *s=='u' ? 4 : 8;
5807 s++;
5808
5809 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
5810 outpos = p-PyUnicode_AS_UNICODE(v);
5811 for (x = 0, i = 0; i < count; ++i, ++s) {
5812 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005813 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005814 endinpos = s-starts;
5815 if (unicode_decode_call_errorhandler(
5816 errors, &errorHandler,
5817 "rawunicodeescape", "truncated \\uXXXX",
5818 &starts, &end, &startinpos, &endinpos, &exc, &s,
5819 &v, &outpos, &p))
5820 goto onError;
5821 goto nextByte;
5822 }
5823 x = (x<<4) & ~0xF;
5824 if (c >= '0' && c <= '9')
5825 x += c - '0';
5826 else if (c >= 'a' && c <= 'f')
5827 x += 10 + c - 'a';
5828 else
5829 x += 10 + c - 'A';
5830 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00005831 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00005832 /* UCS-2 character */
5833 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005834 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005835 /* UCS-4 character. Either store directly, or as
5836 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00005837#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005838 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005839#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005840 x -= 0x10000L;
5841 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
5842 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00005843#endif
5844 } else {
5845 endinpos = s-starts;
5846 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005847 if (unicode_decode_call_errorhandler(
5848 errors, &errorHandler,
5849 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005850 &starts, &end, &startinpos, &endinpos, &exc, &s,
5851 &v, &outpos, &p))
5852 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005853 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005854 nextByte:
5855 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005856 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02005857 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005858 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005859 Py_XDECREF(errorHandler);
5860 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005861 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005862 Py_DECREF(v);
5863 return NULL;
5864 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005865 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005866
Benjamin Peterson29060642009-01-31 22:14:21 +00005867 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005868 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005869 Py_XDECREF(errorHandler);
5870 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005871 return NULL;
5872}
5873
Alexander Belopolsky40018472011-02-26 01:02:56 +00005874PyObject *
5875PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005876 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005877{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005878 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005879 char *p;
5880 char *q;
5881
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005882#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005883 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005884#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005885 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005886#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00005887
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005888 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005889 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005890
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005891 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005892 if (repr == NULL)
5893 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005894 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005895 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005896
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005897 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005898 while (size-- > 0) {
5899 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005900#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005901 /* Map 32-bit characters to '\Uxxxxxxxx' */
5902 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005903 *p++ = '\\';
5904 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005905 *p++ = hexdigits[(ch >> 28) & 0xf];
5906 *p++ = hexdigits[(ch >> 24) & 0xf];
5907 *p++ = hexdigits[(ch >> 20) & 0xf];
5908 *p++ = hexdigits[(ch >> 16) & 0xf];
5909 *p++ = hexdigits[(ch >> 12) & 0xf];
5910 *p++ = hexdigits[(ch >> 8) & 0xf];
5911 *p++ = hexdigits[(ch >> 4) & 0xf];
5912 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00005913 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005914 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00005915#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005916 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5917 if (ch >= 0xD800 && ch < 0xDC00) {
5918 Py_UNICODE ch2;
5919 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005920
Benjamin Peterson29060642009-01-31 22:14:21 +00005921 ch2 = *s++;
5922 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005923 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005924 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5925 *p++ = '\\';
5926 *p++ = 'U';
5927 *p++ = hexdigits[(ucs >> 28) & 0xf];
5928 *p++ = hexdigits[(ucs >> 24) & 0xf];
5929 *p++ = hexdigits[(ucs >> 20) & 0xf];
5930 *p++ = hexdigits[(ucs >> 16) & 0xf];
5931 *p++ = hexdigits[(ucs >> 12) & 0xf];
5932 *p++ = hexdigits[(ucs >> 8) & 0xf];
5933 *p++ = hexdigits[(ucs >> 4) & 0xf];
5934 *p++ = hexdigits[ucs & 0xf];
5935 continue;
5936 }
5937 /* Fall through: isolated surrogates are copied as-is */
5938 s--;
5939 size++;
5940 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005941#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005942 /* Map 16-bit characters to '\uxxxx' */
5943 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005944 *p++ = '\\';
5945 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005946 *p++ = hexdigits[(ch >> 12) & 0xf];
5947 *p++ = hexdigits[(ch >> 8) & 0xf];
5948 *p++ = hexdigits[(ch >> 4) & 0xf];
5949 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005950 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005951 /* Copy everything else as-is */
5952 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00005953 *p++ = (char) ch;
5954 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005955 size = p - q;
5956
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005957 assert(size > 0);
5958 if (_PyBytes_Resize(&repr, size) < 0)
5959 return NULL;
5960 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005961}
5962
Alexander Belopolsky40018472011-02-26 01:02:56 +00005963PyObject *
5964PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005965{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005966 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005967 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00005968 PyErr_BadArgument();
5969 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005970 }
Walter Dörwald711005d2007-05-12 12:03:26 +00005971 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5972 PyUnicode_GET_SIZE(unicode));
5973
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005974 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005975}
5976
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005977/* --- Unicode Internal Codec ------------------------------------------- */
5978
Alexander Belopolsky40018472011-02-26 01:02:56 +00005979PyObject *
5980_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005981 Py_ssize_t size,
5982 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005983{
5984 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005985 Py_ssize_t startinpos;
5986 Py_ssize_t endinpos;
5987 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005988 PyUnicodeObject *v;
5989 Py_UNICODE *p;
5990 const char *end;
5991 const char *reason;
5992 PyObject *errorHandler = NULL;
5993 PyObject *exc = NULL;
5994
Neal Norwitzd43069c2006-01-08 01:12:10 +00005995#ifdef Py_UNICODE_WIDE
5996 Py_UNICODE unimax = PyUnicode_GetMax();
5997#endif
5998
Thomas Wouters89f507f2006-12-13 04:49:30 +00005999 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006000 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
6001 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006002 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006003 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
6004 as string was created with the old API. */
6005 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006006 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006007 p = PyUnicode_AS_UNICODE(v);
6008 end = s + size;
6009
6010 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006011 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006012 /* We have to sanity check the raw data, otherwise doom looms for
6013 some malformed UCS-4 data. */
6014 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006015#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006016 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006017#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006018 end-s < Py_UNICODE_SIZE
6019 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006020 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006021 startinpos = s - starts;
6022 if (end-s < Py_UNICODE_SIZE) {
6023 endinpos = end-starts;
6024 reason = "truncated input";
6025 }
6026 else {
6027 endinpos = s - starts + Py_UNICODE_SIZE;
6028 reason = "illegal code point (> 0x10FFFF)";
6029 }
6030 outpos = p - PyUnicode_AS_UNICODE(v);
6031 if (unicode_decode_call_errorhandler(
6032 errors, &errorHandler,
6033 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006034 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00006035 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006036 goto onError;
6037 }
6038 }
6039 else {
6040 p++;
6041 s += Py_UNICODE_SIZE;
6042 }
6043 }
6044
Victor Stinnerfe226c02011-10-03 03:52:20 +02006045 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006046 goto onError;
6047 Py_XDECREF(errorHandler);
6048 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006049 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006050 Py_DECREF(v);
6051 return NULL;
6052 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006053 return (PyObject *)v;
6054
Benjamin Peterson29060642009-01-31 22:14:21 +00006055 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006056 Py_XDECREF(v);
6057 Py_XDECREF(errorHandler);
6058 Py_XDECREF(exc);
6059 return NULL;
6060}
6061
Guido van Rossumd57fd912000-03-10 22:53:23 +00006062/* --- Latin-1 Codec ------------------------------------------------------ */
6063
Alexander Belopolsky40018472011-02-26 01:02:56 +00006064PyObject *
6065PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006066 Py_ssize_t size,
6067 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006068{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006069 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006070 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006071}
6072
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006073/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006074static void
6075make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006076 const char *encoding,
6077 const Py_UNICODE *unicode, Py_ssize_t size,
6078 Py_ssize_t startpos, Py_ssize_t endpos,
6079 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006080{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006081 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006082 *exceptionObject = PyUnicodeEncodeError_Create(
6083 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006084 }
6085 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006086 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6087 goto onError;
6088 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6089 goto onError;
6090 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6091 goto onError;
6092 return;
6093 onError:
6094 Py_DECREF(*exceptionObject);
6095 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006096 }
6097}
6098
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006099/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006100static void
6101raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006102 const char *encoding,
6103 const Py_UNICODE *unicode, Py_ssize_t size,
6104 Py_ssize_t startpos, Py_ssize_t endpos,
6105 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006106{
6107 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006108 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006109 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006110 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006111}
6112
6113/* error handling callback helper:
6114 build arguments, call the callback and check the arguments,
6115 put the result into newpos and return the replacement string, which
6116 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006117static PyObject *
6118unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006119 PyObject **errorHandler,
6120 const char *encoding, const char *reason,
6121 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
6122 Py_ssize_t startpos, Py_ssize_t endpos,
6123 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006124{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006125 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006126
6127 PyObject *restuple;
6128 PyObject *resunicode;
6129
6130 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006131 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006132 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006133 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006134 }
6135
6136 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006137 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006138 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006139 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006140
6141 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006142 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006143 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006144 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006145 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006146 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006147 Py_DECREF(restuple);
6148 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006149 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006150 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006151 &resunicode, newpos)) {
6152 Py_DECREF(restuple);
6153 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006154 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006155 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6156 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6157 Py_DECREF(restuple);
6158 return NULL;
6159 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006160 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006161 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006162 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006163 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6164 Py_DECREF(restuple);
6165 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006166 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006167 Py_INCREF(resunicode);
6168 Py_DECREF(restuple);
6169 return resunicode;
6170}
6171
Alexander Belopolsky40018472011-02-26 01:02:56 +00006172static PyObject *
6173unicode_encode_ucs1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006174 Py_ssize_t size,
6175 const char *errors,
6176 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006177{
6178 /* output object */
6179 PyObject *res;
6180 /* pointers to the beginning and end+1 of input */
6181 const Py_UNICODE *startp = p;
6182 const Py_UNICODE *endp = p + size;
6183 /* pointer to the beginning of the unencodable characters */
6184 /* const Py_UNICODE *badp = NULL; */
6185 /* pointer into the output */
6186 char *str;
6187 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006188 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006189 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6190 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006191 PyObject *errorHandler = NULL;
6192 PyObject *exc = NULL;
6193 /* the following variable is used for caching string comparisons
6194 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6195 int known_errorHandler = -1;
6196
6197 /* allocate enough for a simple encoding without
6198 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006199 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006200 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006201 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006202 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006203 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006204 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006205 ressize = size;
6206
6207 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006208 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006209
Benjamin Peterson29060642009-01-31 22:14:21 +00006210 /* can we encode this? */
6211 if (c<limit) {
6212 /* no overflow check, because we know that the space is enough */
6213 *str++ = (char)c;
6214 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006215 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006216 else {
6217 Py_ssize_t unicodepos = p-startp;
6218 Py_ssize_t requiredsize;
6219 PyObject *repunicode;
6220 Py_ssize_t repsize;
6221 Py_ssize_t newpos;
6222 Py_ssize_t respos;
6223 Py_UNICODE *uni2;
6224 /* startpos for collecting unencodable chars */
6225 const Py_UNICODE *collstart = p;
6226 const Py_UNICODE *collend = p;
6227 /* find all unecodable characters */
6228 while ((collend < endp) && ((*collend)>=limit))
6229 ++collend;
6230 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6231 if (known_errorHandler==-1) {
6232 if ((errors==NULL) || (!strcmp(errors, "strict")))
6233 known_errorHandler = 1;
6234 else if (!strcmp(errors, "replace"))
6235 known_errorHandler = 2;
6236 else if (!strcmp(errors, "ignore"))
6237 known_errorHandler = 3;
6238 else if (!strcmp(errors, "xmlcharrefreplace"))
6239 known_errorHandler = 4;
6240 else
6241 known_errorHandler = 0;
6242 }
6243 switch (known_errorHandler) {
6244 case 1: /* strict */
6245 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
6246 goto onError;
6247 case 2: /* replace */
6248 while (collstart++<collend)
6249 *str++ = '?'; /* fall through */
6250 case 3: /* ignore */
6251 p = collend;
6252 break;
6253 case 4: /* xmlcharrefreplace */
6254 respos = str - PyBytes_AS_STRING(res);
6255 /* determine replacement size (temporarily (mis)uses p) */
6256 for (p = collstart, repsize = 0; p < collend; ++p) {
6257 if (*p<10)
6258 repsize += 2+1+1;
6259 else if (*p<100)
6260 repsize += 2+2+1;
6261 else if (*p<1000)
6262 repsize += 2+3+1;
6263 else if (*p<10000)
6264 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006265#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006266 else
6267 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006268#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006269 else if (*p<100000)
6270 repsize += 2+5+1;
6271 else if (*p<1000000)
6272 repsize += 2+6+1;
6273 else
6274 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00006275#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006276 }
6277 requiredsize = respos+repsize+(endp-collend);
6278 if (requiredsize > ressize) {
6279 if (requiredsize<2*ressize)
6280 requiredsize = 2*ressize;
6281 if (_PyBytes_Resize(&res, requiredsize))
6282 goto onError;
6283 str = PyBytes_AS_STRING(res) + respos;
6284 ressize = requiredsize;
6285 }
6286 /* generate replacement (temporarily (mis)uses p) */
6287 for (p = collstart; p < collend; ++p) {
6288 str += sprintf(str, "&#%d;", (int)*p);
6289 }
6290 p = collend;
6291 break;
6292 default:
6293 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6294 encoding, reason, startp, size, &exc,
6295 collstart-startp, collend-startp, &newpos);
6296 if (repunicode == NULL)
6297 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006298 if (PyBytes_Check(repunicode)) {
6299 /* Directly copy bytes result to output. */
6300 repsize = PyBytes_Size(repunicode);
6301 if (repsize > 1) {
6302 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006303 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006304 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6305 Py_DECREF(repunicode);
6306 goto onError;
6307 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006308 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006309 ressize += repsize-1;
6310 }
6311 memcpy(str, PyBytes_AsString(repunicode), repsize);
6312 str += repsize;
6313 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006314 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006315 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006316 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006317 /* need more space? (at least enough for what we
6318 have+the replacement+the rest of the string, so
6319 we won't have to check space for encodable characters) */
6320 respos = str - PyBytes_AS_STRING(res);
6321 repsize = PyUnicode_GET_SIZE(repunicode);
6322 requiredsize = respos+repsize+(endp-collend);
6323 if (requiredsize > ressize) {
6324 if (requiredsize<2*ressize)
6325 requiredsize = 2*ressize;
6326 if (_PyBytes_Resize(&res, requiredsize)) {
6327 Py_DECREF(repunicode);
6328 goto onError;
6329 }
6330 str = PyBytes_AS_STRING(res) + respos;
6331 ressize = requiredsize;
6332 }
6333 /* check if there is anything unencodable in the replacement
6334 and copy it to the output */
6335 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
6336 c = *uni2;
6337 if (c >= limit) {
6338 raise_encode_exception(&exc, encoding, startp, size,
6339 unicodepos, unicodepos+1, reason);
6340 Py_DECREF(repunicode);
6341 goto onError;
6342 }
6343 *str = (char)c;
6344 }
6345 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006346 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006347 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006348 }
6349 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006350 /* Resize if we allocated to much */
6351 size = str - PyBytes_AS_STRING(res);
6352 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006353 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006354 if (_PyBytes_Resize(&res, size) < 0)
6355 goto onError;
6356 }
6357
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006358 Py_XDECREF(errorHandler);
6359 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006360 return res;
6361
6362 onError:
6363 Py_XDECREF(res);
6364 Py_XDECREF(errorHandler);
6365 Py_XDECREF(exc);
6366 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006367}
6368
Alexander Belopolsky40018472011-02-26 01:02:56 +00006369PyObject *
6370PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006371 Py_ssize_t size,
6372 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006373{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006374 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006375}
6376
Alexander Belopolsky40018472011-02-26 01:02:56 +00006377PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006378_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006379{
6380 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006381 PyErr_BadArgument();
6382 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006383 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006384 if (PyUnicode_READY(unicode) == -1)
6385 return NULL;
6386 /* Fast path: if it is a one-byte string, construct
6387 bytes object directly. */
6388 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6389 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6390 PyUnicode_GET_LENGTH(unicode));
6391 /* Non-Latin-1 characters present. Defer to above function to
6392 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006393 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006394 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006395 errors);
6396}
6397
6398PyObject*
6399PyUnicode_AsLatin1String(PyObject *unicode)
6400{
6401 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006402}
6403
6404/* --- 7-bit ASCII Codec -------------------------------------------------- */
6405
Alexander Belopolsky40018472011-02-26 01:02:56 +00006406PyObject *
6407PyUnicode_DecodeASCII(const char *s,
6408 Py_ssize_t size,
6409 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006410{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006411 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006412 PyUnicodeObject *v;
6413 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006414 Py_ssize_t startinpos;
6415 Py_ssize_t endinpos;
6416 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006417 const char *e;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006418 unsigned char* d;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006419 PyObject *errorHandler = NULL;
6420 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006421 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00006422
Guido van Rossumd57fd912000-03-10 22:53:23 +00006423 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006424 if (size == 1 && *(unsigned char*)s < 128)
6425 return PyUnicode_FromOrdinal(*(unsigned char*)s);
6426
6427 /* Fast path. Assume the input actually *is* ASCII, and allocate
6428 a single-block Unicode object with that assumption. If there is
6429 an error, drop the object and start over. */
6430 v = (PyUnicodeObject*)PyUnicode_New(size, 127);
6431 if (v == NULL)
6432 goto onError;
6433 d = PyUnicode_1BYTE_DATA(v);
6434 for (i = 0; i < size; i++) {
6435 unsigned char ch = ((unsigned char*)s)[i];
6436 if (ch < 128)
6437 d[i] = ch;
6438 else
6439 break;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006440 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006441 if (i == size)
6442 return (PyObject*)v;
6443 Py_DECREF(v); /* start over */
Tim Petersced69f82003-09-16 20:30:58 +00006444
Guido van Rossumd57fd912000-03-10 22:53:23 +00006445 v = _PyUnicode_New(size);
6446 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006447 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006448 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006449 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006450 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006451 e = s + size;
6452 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006453 register unsigned char c = (unsigned char)*s;
6454 if (c < 128) {
6455 *p++ = c;
6456 ++s;
6457 }
6458 else {
6459 startinpos = s-starts;
6460 endinpos = startinpos + 1;
6461 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
6462 if (unicode_decode_call_errorhandler(
6463 errors, &errorHandler,
6464 "ascii", "ordinal not in range(128)",
6465 &starts, &e, &startinpos, &endinpos, &exc, &s,
6466 &v, &outpos, &p))
6467 goto onError;
6468 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006469 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00006470 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinnerfe226c02011-10-03 03:52:20 +02006471 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006472 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006473 Py_XDECREF(errorHandler);
6474 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006475 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006476 Py_DECREF(v);
6477 return NULL;
6478 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006479 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006480
Benjamin Peterson29060642009-01-31 22:14:21 +00006481 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006482 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006483 Py_XDECREF(errorHandler);
6484 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006485 return NULL;
6486}
6487
Alexander Belopolsky40018472011-02-26 01:02:56 +00006488PyObject *
6489PyUnicode_EncodeASCII(const Py_UNICODE *p,
6490 Py_ssize_t size,
6491 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006492{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006493 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006494}
6495
Alexander Belopolsky40018472011-02-26 01:02:56 +00006496PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006497_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006498{
6499 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006500 PyErr_BadArgument();
6501 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006502 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006503 if (PyUnicode_READY(unicode) == -1)
6504 return NULL;
6505 /* Fast path: if it is an ASCII-only string, construct bytes object
6506 directly. Else defer to above function to raise the exception. */
6507 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6508 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6509 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006510 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006511 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006512 errors);
6513}
6514
6515PyObject *
6516PyUnicode_AsASCIIString(PyObject *unicode)
6517{
6518 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006519}
6520
Victor Stinner99b95382011-07-04 14:23:54 +02006521#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006522
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006523/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006524
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006525#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006526#define NEED_RETRY
6527#endif
6528
6529/* XXX This code is limited to "true" double-byte encodings, as
6530 a) it assumes an incomplete character consists of a single byte, and
6531 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00006532 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006533
/* Return 1 if the byte at s[offset] is judged to be a DBCS lead byte,
   0 otherwise.  A byte that IsDBCSLeadByte() accepts is only counted when
   the character before it (found with CharPrev()) shows that it is not
   itself the second byte of a double-byte character. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    /* CharPrev() did not move: there is no preceding character */
    if (prev == curr)
        return 1;
    /* the preceding character does not start with a lead byte */
    if (!IsDBCSLeadByte(*prev))
        return 1;
    /* the preceding character is a complete two-byte character */
    return curr - prev == 2;
}
6545
6546/*
6547 * Decode MBCS string into unicode object. If 'final' is set, converts
6548 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6549 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006550static int
6551decode_mbcs(PyUnicodeObject **v,
6552 const char *s, /* MBCS string */
6553 int size, /* sizeof MBCS string */
6554 int final,
6555 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006556{
6557 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00006558 Py_ssize_t n;
6559 DWORD usize;
6560 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006561
6562 assert(size >= 0);
6563
Victor Stinner554f3f02010-06-16 23:33:54 +00006564 /* check and handle 'errors' arg */
6565 if (errors==NULL || strcmp(errors, "strict")==0)
6566 flags = MB_ERR_INVALID_CHARS;
6567 else if (strcmp(errors, "ignore")==0)
6568 flags = 0;
6569 else {
6570 PyErr_Format(PyExc_ValueError,
6571 "mbcs encoding does not support errors='%s'",
6572 errors);
6573 return -1;
6574 }
6575
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006576 /* Skip trailing lead-byte unless 'final' is set */
6577 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006578 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006579
6580 /* First get the size of the result */
6581 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006582 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6583 if (usize==0)
6584 goto mbcs_decode_error;
6585 } else
6586 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006587
6588 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006589 /* Create unicode object */
6590 *v = _PyUnicode_New(usize);
6591 if (*v == NULL)
6592 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006593 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006594 }
6595 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006596 /* Extend unicode object */
6597 n = PyUnicode_GET_SIZE(*v);
Victor Stinner2fd82272011-10-03 04:06:05 +02006598 if (PyUnicode_Resize((PyObject**)v, n + usize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006599 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006600 }
6601
6602 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00006603 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006604 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006605 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6606 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00006607 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006608 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006609 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00006610
6611mbcs_decode_error:
6612 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6613 we raise a UnicodeDecodeError - else it is a 'generic'
6614 windows error
6615 */
6616 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6617 /* Ideally, we should get reason from FormatMessage - this
6618 is the Windows 2000 English version of the message
6619 */
6620 PyObject *exc = NULL;
6621 const char *reason = "No mapping for the Unicode character exists "
6622 "in the target multi-byte code page.";
6623 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6624 if (exc != NULL) {
6625 PyCodec_StrictErrors(exc);
6626 Py_DECREF(exc);
6627 }
6628 } else {
6629 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6630 }
6631 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006632}
6633
Alexander Belopolsky40018472011-02-26 01:02:56 +00006634PyObject *
6635PyUnicode_DecodeMBCSStateful(const char *s,
6636 Py_ssize_t size,
6637 const char *errors,
6638 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006639{
6640 PyUnicodeObject *v = NULL;
6641 int done;
6642
6643 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006644 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006645
6646#ifdef NEED_RETRY
6647 retry:
6648 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006649 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006650 else
6651#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006652 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006653
6654 if (done < 0) {
6655 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006656 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006657 }
6658
6659 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006660 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006661
6662#ifdef NEED_RETRY
6663 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006664 s += done;
6665 size -= done;
6666 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006667 }
6668#endif
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006669 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006670 Py_DECREF(v);
6671 return NULL;
6672 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006673 return (PyObject *)v;
6674}
6675
Alexander Belopolsky40018472011-02-26 01:02:56 +00006676PyObject *
6677PyUnicode_DecodeMBCS(const char *s,
6678 Py_ssize_t size,
6679 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006680{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006681 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6682}
6683
6684/*
6685 * Convert unicode into string object (MBCS).
6686 * Returns 0 if succeed, -1 otherwise.
6687 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006688static int
6689encode_mbcs(PyObject **repr,
6690 const Py_UNICODE *p, /* unicode */
6691 int size, /* size of unicode */
6692 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006693{
Victor Stinner554f3f02010-06-16 23:33:54 +00006694 BOOL usedDefaultChar = FALSE;
6695 BOOL *pusedDefaultChar;
6696 int mbcssize;
6697 Py_ssize_t n;
6698 PyObject *exc = NULL;
6699 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006700
6701 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006702
Victor Stinner554f3f02010-06-16 23:33:54 +00006703 /* check and handle 'errors' arg */
6704 if (errors==NULL || strcmp(errors, "strict")==0) {
6705 flags = WC_NO_BEST_FIT_CHARS;
6706 pusedDefaultChar = &usedDefaultChar;
6707 } else if (strcmp(errors, "replace")==0) {
6708 flags = 0;
6709 pusedDefaultChar = NULL;
6710 } else {
6711 PyErr_Format(PyExc_ValueError,
6712 "mbcs encoding does not support errors='%s'",
6713 errors);
6714 return -1;
6715 }
6716
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006717 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006718 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006719 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
6720 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00006721 if (mbcssize == 0) {
6722 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6723 return -1;
6724 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006725 /* If we used a default char, then we failed! */
6726 if (pusedDefaultChar && *pusedDefaultChar)
6727 goto mbcs_encode_error;
6728 } else {
6729 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006730 }
6731
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006732 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006733 /* Create string object */
6734 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
6735 if (*repr == NULL)
6736 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006737 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006738 }
6739 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006740 /* Extend string object */
6741 n = PyBytes_Size(*repr);
6742 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
6743 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006744 }
6745
6746 /* Do the conversion */
6747 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006748 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006749 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
6750 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006751 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6752 return -1;
6753 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006754 if (pusedDefaultChar && *pusedDefaultChar)
6755 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006756 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006757 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006758
6759mbcs_encode_error:
6760 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
6761 Py_XDECREF(exc);
6762 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006763}
6764
Alexander Belopolsky40018472011-02-26 01:02:56 +00006765PyObject *
6766PyUnicode_EncodeMBCS(const Py_UNICODE *p,
6767 Py_ssize_t size,
6768 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006769{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006770 PyObject *repr = NULL;
6771 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00006772
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006773#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00006774 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006775 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006776 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006777 else
6778#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006779 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006780
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006781 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006782 Py_XDECREF(repr);
6783 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006784 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006785
6786#ifdef NEED_RETRY
6787 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006788 p += INT_MAX;
6789 size -= INT_MAX;
6790 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006791 }
6792#endif
6793
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006794 return repr;
6795}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006796
Alexander Belopolsky40018472011-02-26 01:02:56 +00006797PyObject *
6798PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006799{
6800 if (!PyUnicode_Check(unicode)) {
6801 PyErr_BadArgument();
6802 return NULL;
6803 }
6804 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006805 PyUnicode_GET_SIZE(unicode),
6806 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006807}
6808
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006809#undef NEED_RETRY
6810
Victor Stinner99b95382011-07-04 14:23:54 +02006811#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006812
Guido van Rossumd57fd912000-03-10 22:53:23 +00006813/* --- Character Mapping Codec -------------------------------------------- */
6814
Alexander Belopolsky40018472011-02-26 01:02:56 +00006815PyObject *
6816PyUnicode_DecodeCharmap(const char *s,
6817 Py_ssize_t size,
6818 PyObject *mapping,
6819 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006820{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006821 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006822 Py_ssize_t startinpos;
6823 Py_ssize_t endinpos;
6824 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006825 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006826 PyUnicodeObject *v;
6827 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006828 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006829 PyObject *errorHandler = NULL;
6830 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006831 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006832 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006833
Guido van Rossumd57fd912000-03-10 22:53:23 +00006834 /* Default to Latin-1 */
6835 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006836 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006837
6838 v = _PyUnicode_New(size);
6839 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006840 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006841 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006842 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006843 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006844 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006845 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006846 mapstring = PyUnicode_AS_UNICODE(mapping);
6847 maplen = PyUnicode_GET_SIZE(mapping);
6848 while (s < e) {
6849 unsigned char ch = *s;
6850 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006851
Benjamin Peterson29060642009-01-31 22:14:21 +00006852 if (ch < maplen)
6853 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006854
Benjamin Peterson29060642009-01-31 22:14:21 +00006855 if (x == 0xfffe) {
6856 /* undefined mapping */
6857 outpos = p-PyUnicode_AS_UNICODE(v);
6858 startinpos = s-starts;
6859 endinpos = startinpos+1;
6860 if (unicode_decode_call_errorhandler(
6861 errors, &errorHandler,
6862 "charmap", "character maps to <undefined>",
6863 &starts, &e, &startinpos, &endinpos, &exc, &s,
6864 &v, &outpos, &p)) {
6865 goto onError;
6866 }
6867 continue;
6868 }
6869 *p++ = x;
6870 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006871 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006872 }
6873 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006874 while (s < e) {
6875 unsigned char ch = *s;
6876 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006877
Benjamin Peterson29060642009-01-31 22:14:21 +00006878 /* Get mapping (char ordinal -> integer, Unicode char or None) */
6879 w = PyLong_FromLong((long)ch);
6880 if (w == NULL)
6881 goto onError;
6882 x = PyObject_GetItem(mapping, w);
6883 Py_DECREF(w);
6884 if (x == NULL) {
6885 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6886 /* No mapping found means: mapping is undefined. */
6887 PyErr_Clear();
6888 x = Py_None;
6889 Py_INCREF(x);
6890 } else
6891 goto onError;
6892 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006893
Benjamin Peterson29060642009-01-31 22:14:21 +00006894 /* Apply mapping */
6895 if (PyLong_Check(x)) {
6896 long value = PyLong_AS_LONG(x);
6897 if (value < 0 || value > 65535) {
6898 PyErr_SetString(PyExc_TypeError,
6899 "character mapping must be in range(65536)");
6900 Py_DECREF(x);
6901 goto onError;
6902 }
6903 *p++ = (Py_UNICODE)value;
6904 }
6905 else if (x == Py_None) {
6906 /* undefined mapping */
6907 outpos = p-PyUnicode_AS_UNICODE(v);
6908 startinpos = s-starts;
6909 endinpos = startinpos+1;
6910 if (unicode_decode_call_errorhandler(
6911 errors, &errorHandler,
6912 "charmap", "character maps to <undefined>",
6913 &starts, &e, &startinpos, &endinpos, &exc, &s,
6914 &v, &outpos, &p)) {
6915 Py_DECREF(x);
6916 goto onError;
6917 }
6918 Py_DECREF(x);
6919 continue;
6920 }
6921 else if (PyUnicode_Check(x)) {
6922 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006923
Benjamin Peterson29060642009-01-31 22:14:21 +00006924 if (targetsize == 1)
6925 /* 1-1 mapping */
6926 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006927
Benjamin Peterson29060642009-01-31 22:14:21 +00006928 else if (targetsize > 1) {
6929 /* 1-n mapping */
6930 if (targetsize > extrachars) {
6931 /* resize first */
6932 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
6933 Py_ssize_t needed = (targetsize - extrachars) + \
6934 (targetsize << 2);
6935 extrachars += needed;
6936 /* XXX overflow detection missing */
Victor Stinnerfe226c02011-10-03 03:52:20 +02006937 if (PyUnicode_Resize((PyObject**)&v,
Benjamin Peterson29060642009-01-31 22:14:21 +00006938 PyUnicode_GET_SIZE(v) + needed) < 0) {
6939 Py_DECREF(x);
6940 goto onError;
6941 }
6942 p = PyUnicode_AS_UNICODE(v) + oldpos;
6943 }
6944 Py_UNICODE_COPY(p,
6945 PyUnicode_AS_UNICODE(x),
6946 targetsize);
6947 p += targetsize;
6948 extrachars -= targetsize;
6949 }
6950 /* 1-0 mapping: skip the character */
6951 }
6952 else {
6953 /* wrong return value */
6954 PyErr_SetString(PyExc_TypeError,
6955 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006956 Py_DECREF(x);
6957 goto onError;
6958 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006959 Py_DECREF(x);
6960 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006961 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006962 }
6963 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinnerfe226c02011-10-03 03:52:20 +02006964 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006965 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006966 Py_XDECREF(errorHandler);
6967 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006968 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006969 Py_DECREF(v);
6970 return NULL;
6971 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006972 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006973
Benjamin Peterson29060642009-01-31 22:14:21 +00006974 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006975 Py_XDECREF(errorHandler);
6976 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006977 Py_XDECREF(v);
6978 return NULL;
6979}
6980
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006981/* Charmap encoding: the lookup table */
6982
Alexander Belopolsky40018472011-02-26 01:02:56 +00006983struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00006984 PyObject_HEAD
6985 unsigned char level1[32];
6986 int count2, count3;
6987 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006988};
6989
6990static PyObject*
6991encoding_map_size(PyObject *obj, PyObject* args)
6992{
6993 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006994 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00006995 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006996}
6997
6998static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006999 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007000 PyDoc_STR("Return the size (in bytes) of this object") },
7001 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007002};
7003
7004static void
7005encoding_map_dealloc(PyObject* o)
7006{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007007 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007008}
7009
7010static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007011 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007012 "EncodingMap", /*tp_name*/
7013 sizeof(struct encoding_map), /*tp_basicsize*/
7014 0, /*tp_itemsize*/
7015 /* methods */
7016 encoding_map_dealloc, /*tp_dealloc*/
7017 0, /*tp_print*/
7018 0, /*tp_getattr*/
7019 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007020 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007021 0, /*tp_repr*/
7022 0, /*tp_as_number*/
7023 0, /*tp_as_sequence*/
7024 0, /*tp_as_mapping*/
7025 0, /*tp_hash*/
7026 0, /*tp_call*/
7027 0, /*tp_str*/
7028 0, /*tp_getattro*/
7029 0, /*tp_setattro*/
7030 0, /*tp_as_buffer*/
7031 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7032 0, /*tp_doc*/
7033 0, /*tp_traverse*/
7034 0, /*tp_clear*/
7035 0, /*tp_richcompare*/
7036 0, /*tp_weaklistoffset*/
7037 0, /*tp_iter*/
7038 0, /*tp_iternext*/
7039 encoding_map_methods, /*tp_methods*/
7040 0, /*tp_members*/
7041 0, /*tp_getset*/
7042 0, /*tp_base*/
7043 0, /*tp_dict*/
7044 0, /*tp_descr_get*/
7045 0, /*tp_descr_set*/
7046 0, /*tp_dictoffset*/
7047 0, /*tp_init*/
7048 0, /*tp_alloc*/
7049 0, /*tp_new*/
7050 0, /*tp_free*/
7051 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007052};
7053
7054PyObject*
7055PyUnicode_BuildEncodingMap(PyObject* string)
7056{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007057 PyObject *result;
7058 struct encoding_map *mresult;
7059 int i;
7060 int need_dict = 0;
7061 unsigned char level1[32];
7062 unsigned char level2[512];
7063 unsigned char *mlevel1, *mlevel2, *mlevel3;
7064 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007065 int kind;
7066 void *data;
7067 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007068
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007069 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007070 PyErr_BadArgument();
7071 return NULL;
7072 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007073 kind = PyUnicode_KIND(string);
7074 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007075 memset(level1, 0xFF, sizeof level1);
7076 memset(level2, 0xFF, sizeof level2);
7077
7078 /* If there isn't a one-to-one mapping of NULL to \0,
7079 or if there are non-BMP characters, we need to use
7080 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007081 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007082 need_dict = 1;
7083 for (i = 1; i < 256; i++) {
7084 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007085 ch = PyUnicode_READ(kind, data, i);
7086 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007087 need_dict = 1;
7088 break;
7089 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007090 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007091 /* unmapped character */
7092 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007093 l1 = ch >> 11;
7094 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007095 if (level1[l1] == 0xFF)
7096 level1[l1] = count2++;
7097 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007098 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007099 }
7100
7101 if (count2 >= 0xFF || count3 >= 0xFF)
7102 need_dict = 1;
7103
7104 if (need_dict) {
7105 PyObject *result = PyDict_New();
7106 PyObject *key, *value;
7107 if (!result)
7108 return NULL;
7109 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007110 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007111 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007112 if (!key || !value)
7113 goto failed1;
7114 if (PyDict_SetItem(result, key, value) == -1)
7115 goto failed1;
7116 Py_DECREF(key);
7117 Py_DECREF(value);
7118 }
7119 return result;
7120 failed1:
7121 Py_XDECREF(key);
7122 Py_XDECREF(value);
7123 Py_DECREF(result);
7124 return NULL;
7125 }
7126
7127 /* Create a three-level trie */
7128 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7129 16*count2 + 128*count3 - 1);
7130 if (!result)
7131 return PyErr_NoMemory();
7132 PyObject_Init(result, &EncodingMapType);
7133 mresult = (struct encoding_map*)result;
7134 mresult->count2 = count2;
7135 mresult->count3 = count3;
7136 mlevel1 = mresult->level1;
7137 mlevel2 = mresult->level23;
7138 mlevel3 = mresult->level23 + 16*count2;
7139 memcpy(mlevel1, level1, 32);
7140 memset(mlevel2, 0xFF, 16*count2);
7141 memset(mlevel3, 0, 128*count3);
7142 count3 = 0;
7143 for (i = 1; i < 256; i++) {
7144 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007145 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007146 /* unmapped character */
7147 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007148 o1 = PyUnicode_READ(kind, data, i)>>11;
7149 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007150 i2 = 16*mlevel1[o1] + o2;
7151 if (mlevel2[i2] == 0xFF)
7152 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007153 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007154 i3 = 128*mlevel2[i2] + o3;
7155 mlevel3[i3] = i;
7156 }
7157 return result;
7158}
7159
7160static int
7161encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
7162{
7163 struct encoding_map *map = (struct encoding_map*)mapping;
7164 int l1 = c>>11;
7165 int l2 = (c>>7) & 0xF;
7166 int l3 = c & 0x7F;
7167 int i;
7168
7169#ifdef Py_UNICODE_WIDE
7170 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007171 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007172 }
7173#endif
7174 if (c == 0)
7175 return 0;
7176 /* level 1*/
7177 i = map->level1[l1];
7178 if (i == 0xFF) {
7179 return -1;
7180 }
7181 /* level 2*/
7182 i = map->level23[16*i+l2];
7183 if (i == 0xFF) {
7184 return -1;
7185 }
7186 /* level 3 */
7187 i = map->level23[16*map->count2 + 128*i + l3];
7188 if (i == 0) {
7189 return -1;
7190 }
7191 return i;
7192}
7193
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007194/* Lookup the character ch in the mapping. If the character
7195 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007196 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007197static PyObject *
7198charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007199{
Christian Heimes217cfd12007-12-02 14:31:20 +00007200 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007201 PyObject *x;
7202
7203 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007204 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007205 x = PyObject_GetItem(mapping, w);
7206 Py_DECREF(w);
7207 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007208 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7209 /* No mapping found means: mapping is undefined. */
7210 PyErr_Clear();
7211 x = Py_None;
7212 Py_INCREF(x);
7213 return x;
7214 } else
7215 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007216 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007217 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007218 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007219 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007220 long value = PyLong_AS_LONG(x);
7221 if (value < 0 || value > 255) {
7222 PyErr_SetString(PyExc_TypeError,
7223 "character mapping must be in range(256)");
7224 Py_DECREF(x);
7225 return NULL;
7226 }
7227 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007228 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007229 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007230 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007231 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007232 /* wrong return value */
7233 PyErr_Format(PyExc_TypeError,
7234 "character mapping must return integer, bytes or None, not %.400s",
7235 x->ob_type->tp_name);
7236 Py_DECREF(x);
7237 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007238 }
7239}
7240
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007241static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007242charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007243{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007244 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7245 /* exponentially overallocate to minimize reallocations */
7246 if (requiredsize < 2*outsize)
7247 requiredsize = 2*outsize;
7248 if (_PyBytes_Resize(outobj, requiredsize))
7249 return -1;
7250 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007251}
7252
/* Outcome of encoding a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the character has no mapping; nothing written */
    enc_EXCEPTION   /* an error occurred and an exception is set */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007256/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007257 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007258 space is available. Return a new reference to the object that
7259 was put in the output buffer, or Py_None, if the mapping was undefined
7260 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007261 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007262static charmapencode_result
7263charmapencode_output(Py_UNICODE c, PyObject *mapping,
7264 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007265{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007266 PyObject *rep;
7267 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007268 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007269
Christian Heimes90aa7642007-12-19 02:45:37 +00007270 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007271 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007272 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007273 if (res == -1)
7274 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007275 if (outsize<requiredsize)
7276 if (charmapencode_resize(outobj, outpos, requiredsize))
7277 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007278 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007279 outstart[(*outpos)++] = (char)res;
7280 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007281 }
7282
7283 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007284 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007285 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007286 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007287 Py_DECREF(rep);
7288 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007289 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007290 if (PyLong_Check(rep)) {
7291 Py_ssize_t requiredsize = *outpos+1;
7292 if (outsize<requiredsize)
7293 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7294 Py_DECREF(rep);
7295 return enc_EXCEPTION;
7296 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007297 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007298 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007299 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007300 else {
7301 const char *repchars = PyBytes_AS_STRING(rep);
7302 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7303 Py_ssize_t requiredsize = *outpos+repsize;
7304 if (outsize<requiredsize)
7305 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7306 Py_DECREF(rep);
7307 return enc_EXCEPTION;
7308 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007309 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007310 memcpy(outstart + *outpos, repchars, repsize);
7311 *outpos += repsize;
7312 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007313 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007314 Py_DECREF(rep);
7315 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007316}
7317
7318/* handle an error in PyUnicode_EncodeCharmap
7319 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007320static int
7321charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00007322 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007323 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007324 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007325 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007326{
7327 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007328 Py_ssize_t repsize;
7329 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007330 Py_UNICODE *uni2;
7331 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007332 Py_ssize_t collstartpos = *inpos;
7333 Py_ssize_t collendpos = *inpos+1;
7334 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007335 char *encoding = "charmap";
7336 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007337 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007338
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007339 /* find all unencodable characters */
7340 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007341 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007342 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007343 int res = encoding_map_lookup(p[collendpos], mapping);
7344 if (res != -1)
7345 break;
7346 ++collendpos;
7347 continue;
7348 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007349
Benjamin Peterson29060642009-01-31 22:14:21 +00007350 rep = charmapencode_lookup(p[collendpos], mapping);
7351 if (rep==NULL)
7352 return -1;
7353 else if (rep!=Py_None) {
7354 Py_DECREF(rep);
7355 break;
7356 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007357 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007358 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007359 }
7360 /* cache callback name lookup
7361 * (if not done yet, i.e. it's the first error) */
7362 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007363 if ((errors==NULL) || (!strcmp(errors, "strict")))
7364 *known_errorHandler = 1;
7365 else if (!strcmp(errors, "replace"))
7366 *known_errorHandler = 2;
7367 else if (!strcmp(errors, "ignore"))
7368 *known_errorHandler = 3;
7369 else if (!strcmp(errors, "xmlcharrefreplace"))
7370 *known_errorHandler = 4;
7371 else
7372 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007373 }
7374 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007375 case 1: /* strict */
7376 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7377 return -1;
7378 case 2: /* replace */
7379 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007380 x = charmapencode_output('?', mapping, res, respos);
7381 if (x==enc_EXCEPTION) {
7382 return -1;
7383 }
7384 else if (x==enc_FAILED) {
7385 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7386 return -1;
7387 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007388 }
7389 /* fall through */
7390 case 3: /* ignore */
7391 *inpos = collendpos;
7392 break;
7393 case 4: /* xmlcharrefreplace */
7394 /* generate replacement (temporarily (mis)uses p) */
7395 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007396 char buffer[2+29+1+1];
7397 char *cp;
7398 sprintf(buffer, "&#%d;", (int)p[collpos]);
7399 for (cp = buffer; *cp; ++cp) {
7400 x = charmapencode_output(*cp, mapping, res, respos);
7401 if (x==enc_EXCEPTION)
7402 return -1;
7403 else if (x==enc_FAILED) {
7404 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7405 return -1;
7406 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007407 }
7408 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007409 *inpos = collendpos;
7410 break;
7411 default:
7412 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00007413 encoding, reason, p, size, exceptionObject,
7414 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007415 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007416 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007417 if (PyBytes_Check(repunicode)) {
7418 /* Directly copy bytes result to output. */
7419 Py_ssize_t outsize = PyBytes_Size(*res);
7420 Py_ssize_t requiredsize;
7421 repsize = PyBytes_Size(repunicode);
7422 requiredsize = *respos + repsize;
7423 if (requiredsize > outsize)
7424 /* Make room for all additional bytes. */
7425 if (charmapencode_resize(res, respos, requiredsize)) {
7426 Py_DECREF(repunicode);
7427 return -1;
7428 }
7429 memcpy(PyBytes_AsString(*res) + *respos,
7430 PyBytes_AsString(repunicode), repsize);
7431 *respos += repsize;
7432 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007433 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007434 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007435 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007436 /* generate replacement */
7437 repsize = PyUnicode_GET_SIZE(repunicode);
7438 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007439 x = charmapencode_output(*uni2, mapping, res, respos);
7440 if (x==enc_EXCEPTION) {
7441 return -1;
7442 }
7443 else if (x==enc_FAILED) {
7444 Py_DECREF(repunicode);
7445 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7446 return -1;
7447 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007448 }
7449 *inpos = newpos;
7450 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007451 }
7452 return 0;
7453}
7454
Alexander Belopolsky40018472011-02-26 01:02:56 +00007455PyObject *
7456PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7457 Py_ssize_t size,
7458 PyObject *mapping,
7459 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007460{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007461 /* output object */
7462 PyObject *res = NULL;
7463 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007464 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007465 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007466 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007467 PyObject *errorHandler = NULL;
7468 PyObject *exc = NULL;
7469 /* the following variable is used for caching string comparisons
7470 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7471 * 3=ignore, 4=xmlcharrefreplace */
7472 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007473
7474 /* Default to Latin-1 */
7475 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007476 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007477
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007478 /* allocate enough for a simple encoding without
7479 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007480 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007481 if (res == NULL)
7482 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007483 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007484 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007485
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007486 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007487 /* try to encode it */
7488 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7489 if (x==enc_EXCEPTION) /* error */
7490 goto onError;
7491 if (x==enc_FAILED) { /* unencodable character */
7492 if (charmap_encoding_error(p, size, &inpos, mapping,
7493 &exc,
7494 &known_errorHandler, &errorHandler, errors,
7495 &res, &respos)) {
7496 goto onError;
7497 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007498 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007499 else
7500 /* done with this character => adjust input position */
7501 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007502 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007503
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007504 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007505 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007506 if (_PyBytes_Resize(&res, respos) < 0)
7507 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00007508
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007509 Py_XDECREF(exc);
7510 Py_XDECREF(errorHandler);
7511 return res;
7512
Benjamin Peterson29060642009-01-31 22:14:21 +00007513 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007514 Py_XDECREF(res);
7515 Py_XDECREF(exc);
7516 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007517 return NULL;
7518}
7519
Alexander Belopolsky40018472011-02-26 01:02:56 +00007520PyObject *
7521PyUnicode_AsCharmapString(PyObject *unicode,
7522 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007523{
7524 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007525 PyErr_BadArgument();
7526 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007527 }
7528 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007529 PyUnicode_GET_SIZE(unicode),
7530 mapping,
7531 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007532}
7533
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007534/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007535static void
7536make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007537 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007538 Py_ssize_t startpos, Py_ssize_t endpos,
7539 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007540{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007541 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007542 *exceptionObject = _PyUnicodeTranslateError_Create(
7543 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007544 }
7545 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007546 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7547 goto onError;
7548 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7549 goto onError;
7550 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7551 goto onError;
7552 return;
7553 onError:
7554 Py_DECREF(*exceptionObject);
7555 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007556 }
7557}
7558
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007559/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007560static void
7561raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007562 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007563 Py_ssize_t startpos, Py_ssize_t endpos,
7564 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007565{
7566 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007567 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007568 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007569 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007570}
7571
7572/* error handling callback helper:
7573 build arguments, call the callback and check the arguments,
7574 put the result into newpos and return the replacement string, which
7575 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007576static PyObject *
7577unicode_translate_call_errorhandler(const char *errors,
7578 PyObject **errorHandler,
7579 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007580 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007581 Py_ssize_t startpos, Py_ssize_t endpos,
7582 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007583{
Benjamin Peterson142957c2008-07-04 19:55:29 +00007584 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007585
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007586 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007587 PyObject *restuple;
7588 PyObject *resunicode;
7589
7590 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007591 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007592 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007593 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007594 }
7595
7596 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007597 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007598 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007599 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007600
7601 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00007602 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007603 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007604 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007605 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00007606 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007607 Py_DECREF(restuple);
7608 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007609 }
7610 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00007611 &resunicode, &i_newpos)) {
7612 Py_DECREF(restuple);
7613 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007614 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007615 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007616 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007617 else
7618 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007619 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007620 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7621 Py_DECREF(restuple);
7622 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007623 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007624 Py_INCREF(resunicode);
7625 Py_DECREF(restuple);
7626 return resunicode;
7627}
7628
7629/* Lookup the character ch in the mapping and put the result in result,
7630 which must be decrefed by the caller.
7631 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007632static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007633charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007634{
Christian Heimes217cfd12007-12-02 14:31:20 +00007635 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007636 PyObject *x;
7637
7638 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007639 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007640 x = PyObject_GetItem(mapping, w);
7641 Py_DECREF(w);
7642 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007643 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7644 /* No mapping found means: use 1:1 mapping. */
7645 PyErr_Clear();
7646 *result = NULL;
7647 return 0;
7648 } else
7649 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007650 }
7651 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007652 *result = x;
7653 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007654 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007655 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007656 long value = PyLong_AS_LONG(x);
7657 long max = PyUnicode_GetMax();
7658 if (value < 0 || value > max) {
7659 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00007660 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007661 Py_DECREF(x);
7662 return -1;
7663 }
7664 *result = x;
7665 return 0;
7666 }
7667 else if (PyUnicode_Check(x)) {
7668 *result = x;
7669 return 0;
7670 }
7671 else {
7672 /* wrong return value */
7673 PyErr_SetString(PyExc_TypeError,
7674 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007675 Py_DECREF(x);
7676 return -1;
7677 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007678}
7679/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00007680 if not reallocate and adjust various state variables.
7681 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007682static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007683charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00007684 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007685{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007686 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00007687 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007688 /* exponentially overallocate to minimize reallocations */
7689 if (requiredsize < 2 * oldsize)
7690 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007691 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
7692 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007693 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007694 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007695 }
7696 return 0;
7697}
7698/* lookup the character, put the result in the output string and adjust
7699 various state variables. Return a new reference to the object that
7700 was put in the output buffer in *result, or Py_None, if the mapping was
7701 undefined (in which case no character was written).
7702 The called must decref result.
7703 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007704static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007705charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
7706 PyObject *mapping, Py_UCS4 **output,
7707 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007708 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007709{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007710 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
7711 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00007712 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007713 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007714 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007715 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007716 }
7717 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007718 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00007719 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007720 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007721 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007722 }
7723 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007724 Py_ssize_t repsize;
7725 if (PyUnicode_READY(*res) == -1)
7726 return -1;
7727 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00007728 if (repsize==1) {
7729 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007730 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00007731 }
7732 else if (repsize!=0) {
7733 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007734 Py_ssize_t requiredsize = *opos +
7735 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00007736 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007737 Py_ssize_t i;
7738 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007739 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007740 for(i = 0; i < repsize; i++)
7741 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00007742 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007743 }
7744 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007745 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007746 return 0;
7747}
7748
Alexander Belopolsky40018472011-02-26 01:02:56 +00007749PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007750_PyUnicode_TranslateCharmap(PyObject *input,
7751 PyObject *mapping,
7752 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007753{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007754 /* input object */
7755 char *idata;
7756 Py_ssize_t size, i;
7757 int kind;
7758 /* output buffer */
7759 Py_UCS4 *output = NULL;
7760 Py_ssize_t osize;
7761 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007762 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007763 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007764 char *reason = "character maps to <undefined>";
7765 PyObject *errorHandler = NULL;
7766 PyObject *exc = NULL;
7767 /* the following variable is used for caching string comparisons
7768 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7769 * 3=ignore, 4=xmlcharrefreplace */
7770 int known_errorHandler = -1;
7771
Guido van Rossumd57fd912000-03-10 22:53:23 +00007772 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007773 PyErr_BadArgument();
7774 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007775 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007776
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007777 if (PyUnicode_READY(input) == -1)
7778 return NULL;
7779 idata = (char*)PyUnicode_DATA(input);
7780 kind = PyUnicode_KIND(input);
7781 size = PyUnicode_GET_LENGTH(input);
7782 i = 0;
7783
7784 if (size == 0) {
7785 Py_INCREF(input);
7786 return input;
7787 }
7788
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007789 /* allocate enough for a simple 1:1 translation without
7790 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007791 osize = size;
7792 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
7793 opos = 0;
7794 if (output == NULL) {
7795 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00007796 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007797 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007798
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007799 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007800 /* try to encode it */
7801 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007802 if (charmaptranslate_output(input, i, mapping,
7803 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007804 Py_XDECREF(x);
7805 goto onError;
7806 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007807 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00007808 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007809 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007810 else { /* untranslatable character */
7811 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7812 Py_ssize_t repsize;
7813 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007814 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00007815 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007816 Py_ssize_t collstart = i;
7817 Py_ssize_t collend = i+1;
7818 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007819
Benjamin Peterson29060642009-01-31 22:14:21 +00007820 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007821 while (collend < size) {
7822 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007823 goto onError;
7824 Py_XDECREF(x);
7825 if (x!=Py_None)
7826 break;
7827 ++collend;
7828 }
7829 /* cache callback name lookup
7830 * (if not done yet, i.e. it's the first error) */
7831 if (known_errorHandler==-1) {
7832 if ((errors==NULL) || (!strcmp(errors, "strict")))
7833 known_errorHandler = 1;
7834 else if (!strcmp(errors, "replace"))
7835 known_errorHandler = 2;
7836 else if (!strcmp(errors, "ignore"))
7837 known_errorHandler = 3;
7838 else if (!strcmp(errors, "xmlcharrefreplace"))
7839 known_errorHandler = 4;
7840 else
7841 known_errorHandler = 0;
7842 }
7843 switch (known_errorHandler) {
7844 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007845 raise_translate_exception(&exc, input, collstart,
7846 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007847 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007848 case 2: /* replace */
7849 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007850 for (coll = collstart; coll<collend; coll++)
7851 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00007852 /* fall through */
7853 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007854 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007855 break;
7856 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007857 /* generate replacement (temporarily (mis)uses i) */
7858 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007859 char buffer[2+29+1+1];
7860 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007861 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
7862 if (charmaptranslate_makespace(&output, &osize,
7863 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00007864 goto onError;
7865 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007866 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00007867 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007868 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007869 break;
7870 default:
7871 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007872 reason, input, &exc,
7873 collstart, collend, &newpos);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02007874 if (repunicode == NULL || _PyUnicode_READY_REPLACE(&repunicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00007875 goto onError;
7876 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007877 repsize = PyUnicode_GET_LENGTH(repunicode);
7878 if (charmaptranslate_makespace(&output, &osize,
7879 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007880 Py_DECREF(repunicode);
7881 goto onError;
7882 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007883 for (uni2 = 0; repsize-->0; ++uni2)
7884 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
7885 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00007886 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007887 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007888 }
7889 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007890 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
7891 if (!res)
7892 goto onError;
7893 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007894 Py_XDECREF(exc);
7895 Py_XDECREF(errorHandler);
7896 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007897
Benjamin Peterson29060642009-01-31 22:14:21 +00007898 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007899 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007900 Py_XDECREF(exc);
7901 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007902 return NULL;
7903}
7904
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007905/* Deprecated. Use PyUnicode_Translate instead. */
7906PyObject *
7907PyUnicode_TranslateCharmap(const Py_UNICODE *p,
7908 Py_ssize_t size,
7909 PyObject *mapping,
7910 const char *errors)
7911{
7912 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7913 if (!unicode)
7914 return NULL;
7915 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
7916}
7917
Alexander Belopolsky40018472011-02-26 01:02:56 +00007918PyObject *
7919PyUnicode_Translate(PyObject *str,
7920 PyObject *mapping,
7921 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007922{
7923 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007924
Guido van Rossumd57fd912000-03-10 22:53:23 +00007925 str = PyUnicode_FromObject(str);
7926 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007927 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007928 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007929 Py_DECREF(str);
7930 return result;
Tim Petersced69f82003-09-16 20:30:58 +00007931
Benjamin Peterson29060642009-01-31 22:14:21 +00007932 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007933 Py_XDECREF(str);
7934 return NULL;
7935}
Tim Petersced69f82003-09-16 20:30:58 +00007936
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007937static Py_UCS4
7938fix_decimal_and_space_to_ascii(PyUnicodeObject *self)
7939{
7940 /* No need to call PyUnicode_READY(self) because this function is only
7941 called as a callback from fixup() which does it already. */
7942 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
7943 const int kind = PyUnicode_KIND(self);
7944 void *data = PyUnicode_DATA(self);
7945 Py_UCS4 maxchar = 0, ch, fixed;
7946 Py_ssize_t i;
7947
7948 for (i = 0; i < len; ++i) {
7949 ch = PyUnicode_READ(kind, data, i);
7950 fixed = 0;
7951 if (ch > 127) {
7952 if (Py_UNICODE_ISSPACE(ch))
7953 fixed = ' ';
7954 else {
7955 const int decimal = Py_UNICODE_TODECIMAL(ch);
7956 if (decimal >= 0)
7957 fixed = '0' + decimal;
7958 }
7959 if (fixed != 0) {
7960 if (fixed > maxchar)
7961 maxchar = fixed;
7962 PyUnicode_WRITE(kind, data, i, fixed);
7963 }
7964 else if (ch > maxchar)
7965 maxchar = ch;
7966 }
7967 else if (ch > maxchar)
7968 maxchar = ch;
7969 }
7970
7971 return maxchar;
7972}
7973
7974PyObject *
7975_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
7976{
7977 if (!PyUnicode_Check(unicode)) {
7978 PyErr_BadInternalCall();
7979 return NULL;
7980 }
7981 if (PyUnicode_READY(unicode) == -1)
7982 return NULL;
7983 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
7984 /* If the string is already ASCII, just return the same string */
7985 Py_INCREF(unicode);
7986 return unicode;
7987 }
7988 return fixup((PyUnicodeObject *)unicode, fix_decimal_and_space_to_ascii);
7989}
7990
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007991PyObject *
7992PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
7993 Py_ssize_t length)
7994{
7995 PyObject *result;
7996 Py_UNICODE *p; /* write pointer into result */
7997 Py_ssize_t i;
7998 /* Copy to a new string */
7999 result = (PyObject *)_PyUnicode_New(length);
8000 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
8001 if (result == NULL)
8002 return result;
8003 p = PyUnicode_AS_UNICODE(result);
8004 /* Iterate over code points */
8005 for (i = 0; i < length; i++) {
8006 Py_UNICODE ch =s[i];
8007 if (ch > 127) {
8008 int decimal = Py_UNICODE_TODECIMAL(ch);
8009 if (decimal >= 0)
8010 p[i] = '0' + decimal;
8011 }
8012 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008013 if (PyUnicode_READY((PyUnicodeObject*)result) == -1) {
8014 Py_DECREF(result);
8015 return NULL;
8016 }
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008017 return result;
8018}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008019/* --- Decimal Encoder ---------------------------------------------------- */
8020
Alexander Belopolsky40018472011-02-26 01:02:56 +00008021int
8022PyUnicode_EncodeDecimal(Py_UNICODE *s,
8023 Py_ssize_t length,
8024 char *output,
8025 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008026{
8027 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008028 PyObject *errorHandler = NULL;
8029 PyObject *exc = NULL;
8030 const char *encoding = "decimal";
8031 const char *reason = "invalid decimal Unicode string";
8032 /* the following variable is used for caching string comparisons
8033 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
8034 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008035
8036 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008037 PyErr_BadArgument();
8038 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008039 }
8040
8041 p = s;
8042 end = s + length;
8043 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008044 register Py_UNICODE ch = *p;
8045 int decimal;
8046 PyObject *repunicode;
8047 Py_ssize_t repsize;
8048 Py_ssize_t newpos;
8049 Py_UNICODE *uni2;
8050 Py_UNICODE *collstart;
8051 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00008052
Benjamin Peterson29060642009-01-31 22:14:21 +00008053 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008054 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00008055 ++p;
8056 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008057 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008058 decimal = Py_UNICODE_TODECIMAL(ch);
8059 if (decimal >= 0) {
8060 *output++ = '0' + decimal;
8061 ++p;
8062 continue;
8063 }
8064 if (0 < ch && ch < 256) {
8065 *output++ = (char)ch;
8066 ++p;
8067 continue;
8068 }
8069 /* All other characters are considered unencodable */
8070 collstart = p;
8071 collend = p+1;
8072 while (collend < end) {
8073 if ((0 < *collend && *collend < 256) ||
8074 !Py_UNICODE_ISSPACE(*collend) ||
8075 Py_UNICODE_TODECIMAL(*collend))
8076 break;
8077 }
8078 /* cache callback name lookup
8079 * (if not done yet, i.e. it's the first error) */
8080 if (known_errorHandler==-1) {
8081 if ((errors==NULL) || (!strcmp(errors, "strict")))
8082 known_errorHandler = 1;
8083 else if (!strcmp(errors, "replace"))
8084 known_errorHandler = 2;
8085 else if (!strcmp(errors, "ignore"))
8086 known_errorHandler = 3;
8087 else if (!strcmp(errors, "xmlcharrefreplace"))
8088 known_errorHandler = 4;
8089 else
8090 known_errorHandler = 0;
8091 }
8092 switch (known_errorHandler) {
8093 case 1: /* strict */
8094 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
8095 goto onError;
8096 case 2: /* replace */
8097 for (p = collstart; p < collend; ++p)
8098 *output++ = '?';
8099 /* fall through */
8100 case 3: /* ignore */
8101 p = collend;
8102 break;
8103 case 4: /* xmlcharrefreplace */
8104 /* generate replacement (temporarily (mis)uses p) */
8105 for (p = collstart; p < collend; ++p)
8106 output += sprintf(output, "&#%d;", (int)*p);
8107 p = collend;
8108 break;
8109 default:
8110 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
8111 encoding, reason, s, length, &exc,
8112 collstart-s, collend-s, &newpos);
8113 if (repunicode == NULL)
8114 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008115 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00008116 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008117 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8118 Py_DECREF(repunicode);
8119 goto onError;
8120 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008121 /* generate replacement */
8122 repsize = PyUnicode_GET_SIZE(repunicode);
8123 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
8124 Py_UNICODE ch = *uni2;
8125 if (Py_UNICODE_ISSPACE(ch))
8126 *output++ = ' ';
8127 else {
8128 decimal = Py_UNICODE_TODECIMAL(ch);
8129 if (decimal >= 0)
8130 *output++ = '0' + decimal;
8131 else if (0 < ch && ch < 256)
8132 *output++ = (char)ch;
8133 else {
8134 Py_DECREF(repunicode);
8135 raise_encode_exception(&exc, encoding,
8136 s, length, collstart-s, collend-s, reason);
8137 goto onError;
8138 }
8139 }
8140 }
8141 p = s + newpos;
8142 Py_DECREF(repunicode);
8143 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00008144 }
8145 /* 0-terminate the output string */
8146 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008147 Py_XDECREF(exc);
8148 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008149 return 0;
8150
Benjamin Peterson29060642009-01-31 22:14:21 +00008151 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008152 Py_XDECREF(exc);
8153 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008154 return -1;
8155}
8156
Guido van Rossumd57fd912000-03-10 22:53:23 +00008157/* --- Helpers ------------------------------------------------------------ */
8158
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008159#include "stringlib/ucs1lib.h"
8160#include "stringlib/fastsearch.h"
8161#include "stringlib/partition.h"
8162#include "stringlib/split.h"
8163#include "stringlib/count.h"
8164#include "stringlib/find.h"
8165#include "stringlib/localeutil.h"
8166#include "stringlib/undef.h"
8167
8168#include "stringlib/ucs2lib.h"
8169#include "stringlib/fastsearch.h"
8170#include "stringlib/partition.h"
8171#include "stringlib/split.h"
8172#include "stringlib/count.h"
8173#include "stringlib/find.h"
8174#include "stringlib/localeutil.h"
8175#include "stringlib/undef.h"
8176
8177#include "stringlib/ucs4lib.h"
8178#include "stringlib/fastsearch.h"
8179#include "stringlib/partition.h"
8180#include "stringlib/split.h"
8181#include "stringlib/count.h"
8182#include "stringlib/find.h"
8183#include "stringlib/localeutil.h"
8184#include "stringlib/undef.h"
8185
8186static Py_ssize_t
8187any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
8188 const Py_UCS1*, Py_ssize_t,
8189 Py_ssize_t, Py_ssize_t),
8190 Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
8191 const Py_UCS2*, Py_ssize_t,
8192 Py_ssize_t, Py_ssize_t),
8193 Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t,
8194 const Py_UCS4*, Py_ssize_t,
8195 Py_ssize_t, Py_ssize_t),
8196 PyObject* s1, PyObject* s2,
8197 Py_ssize_t start,
8198 Py_ssize_t end)
8199{
8200 int kind1, kind2, kind;
8201 void *buf1, *buf2;
8202 Py_ssize_t len1, len2, result;
8203
8204 kind1 = PyUnicode_KIND(s1);
8205 kind2 = PyUnicode_KIND(s2);
8206 kind = kind1 > kind2 ? kind1 : kind2;
8207 buf1 = PyUnicode_DATA(s1);
8208 buf2 = PyUnicode_DATA(s2);
8209 if (kind1 != kind)
8210 buf1 = _PyUnicode_AsKind(s1, kind);
8211 if (!buf1)
8212 return -2;
8213 if (kind2 != kind)
8214 buf2 = _PyUnicode_AsKind(s2, kind);
8215 if (!buf2) {
8216 if (kind1 != kind) PyMem_Free(buf1);
8217 return -2;
8218 }
8219 len1 = PyUnicode_GET_LENGTH(s1);
8220 len2 = PyUnicode_GET_LENGTH(s2);
8221
8222 switch(kind) {
8223 case PyUnicode_1BYTE_KIND:
8224 result = ucs1(buf1, len1, buf2, len2, start, end);
8225 break;
8226 case PyUnicode_2BYTE_KIND:
8227 result = ucs2(buf1, len1, buf2, len2, start, end);
8228 break;
8229 case PyUnicode_4BYTE_KIND:
8230 result = ucs4(buf1, len1, buf2, len2, start, end);
8231 break;
8232 default:
8233 assert(0); result = -2;
8234 }
8235
8236 if (kind1 != kind)
8237 PyMem_Free(buf1);
8238 if (kind2 != kind)
8239 PyMem_Free(buf2);
8240
8241 return result;
8242}
8243
8244Py_ssize_t
8245_PyUnicode_InsertThousandsGrouping(int kind, void *data,
8246 Py_ssize_t n_buffer,
8247 void *digits, Py_ssize_t n_digits,
8248 Py_ssize_t min_width,
8249 const char *grouping,
8250 const char *thousands_sep)
8251{
8252 switch(kind) {
8253 case PyUnicode_1BYTE_KIND:
8254 return _PyUnicode_ucs1_InsertThousandsGrouping(
8255 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8256 min_width, grouping, thousands_sep);
8257 case PyUnicode_2BYTE_KIND:
8258 return _PyUnicode_ucs2_InsertThousandsGrouping(
8259 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
8260 min_width, grouping, thousands_sep);
8261 case PyUnicode_4BYTE_KIND:
8262 return _PyUnicode_ucs4_InsertThousandsGrouping(
8263 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
8264 min_width, grouping, thousands_sep);
8265 }
8266 assert(0);
8267 return -1;
8268}
8269
8270
Eric Smith8c663262007-08-25 02:26:07 +00008271#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00008272#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008273
Thomas Wouters477c8d52006-05-27 19:21:47 +00008274#include "stringlib/count.h"
8275#include "stringlib/find.h"
Eric Smith5807c412008-05-11 21:00:57 +00008276
/* Helper macro to fix up start/end slice values in place for a sequence
   of length `len`:
     - `end` larger than `len` is clamped to `len`;
     - a negative `end` or `start` is taken relative to the end of the
       sequence (`len` is added) and then clamped to 0.
   `start` is NOT clamped to `len`; callers must cope with start > end.

   `start` and `end` must be modifiable lvalues and every argument may be
   evaluated more than once, so pass expressions without side effects.
   The body is wrapped in do { } while (0) so that the macro behaves as a
   single statement, e.g. under an unbraced if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00008291
Alexander Belopolsky40018472011-02-26 01:02:56 +00008292Py_ssize_t
8293PyUnicode_Count(PyObject *str,
8294 PyObject *substr,
8295 Py_ssize_t start,
8296 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008297{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008298 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008299 PyUnicodeObject* str_obj;
8300 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008301 int kind1, kind2, kind;
8302 void *buf1 = NULL, *buf2 = NULL;
8303 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008304
Thomas Wouters477c8d52006-05-27 19:21:47 +00008305 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008306 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008307 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008308 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02008309 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008310 Py_DECREF(str_obj);
8311 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008312 }
Tim Petersced69f82003-09-16 20:30:58 +00008313
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008314 kind1 = PyUnicode_KIND(str_obj);
8315 kind2 = PyUnicode_KIND(sub_obj);
8316 kind = kind1 > kind2 ? kind1 : kind2;
8317 buf1 = PyUnicode_DATA(str_obj);
8318 if (kind1 != kind)
8319 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
8320 if (!buf1)
8321 goto onError;
8322 buf2 = PyUnicode_DATA(sub_obj);
8323 if (kind2 != kind)
8324 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
8325 if (!buf2)
8326 goto onError;
8327 len1 = PyUnicode_GET_LENGTH(str_obj);
8328 len2 = PyUnicode_GET_LENGTH(sub_obj);
8329
8330 ADJUST_INDICES(start, end, len1);
8331 switch(kind) {
8332 case PyUnicode_1BYTE_KIND:
8333 result = ucs1lib_count(
8334 ((Py_UCS1*)buf1) + start, end - start,
8335 buf2, len2, PY_SSIZE_T_MAX
8336 );
8337 break;
8338 case PyUnicode_2BYTE_KIND:
8339 result = ucs2lib_count(
8340 ((Py_UCS2*)buf1) + start, end - start,
8341 buf2, len2, PY_SSIZE_T_MAX
8342 );
8343 break;
8344 case PyUnicode_4BYTE_KIND:
8345 result = ucs4lib_count(
8346 ((Py_UCS4*)buf1) + start, end - start,
8347 buf2, len2, PY_SSIZE_T_MAX
8348 );
8349 break;
8350 default:
8351 assert(0); result = 0;
8352 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008353
8354 Py_DECREF(sub_obj);
8355 Py_DECREF(str_obj);
8356
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008357 if (kind1 != kind)
8358 PyMem_Free(buf1);
8359 if (kind2 != kind)
8360 PyMem_Free(buf2);
8361
Guido van Rossumd57fd912000-03-10 22:53:23 +00008362 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008363 onError:
8364 Py_DECREF(sub_obj);
8365 Py_DECREF(str_obj);
8366 if (kind1 != kind && buf1)
8367 PyMem_Free(buf1);
8368 if (kind2 != kind && buf2)
8369 PyMem_Free(buf2);
8370 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008371}
8372
Alexander Belopolsky40018472011-02-26 01:02:56 +00008373Py_ssize_t
8374PyUnicode_Find(PyObject *str,
8375 PyObject *sub,
8376 Py_ssize_t start,
8377 Py_ssize_t end,
8378 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008379{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008380 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008381
Guido van Rossumd57fd912000-03-10 22:53:23 +00008382 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008383 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008384 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008385 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008386 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008387 Py_DECREF(str);
8388 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008389 }
Tim Petersced69f82003-09-16 20:30:58 +00008390
Thomas Wouters477c8d52006-05-27 19:21:47 +00008391 if (direction > 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008392 result = any_find_slice(
8393 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
8394 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008395 );
8396 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008397 result = any_find_slice(
8398 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
8399 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008400 );
8401
Guido van Rossumd57fd912000-03-10 22:53:23 +00008402 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008403 Py_DECREF(sub);
8404
Guido van Rossumd57fd912000-03-10 22:53:23 +00008405 return result;
8406}
8407
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008408Py_ssize_t
8409PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8410 Py_ssize_t start, Py_ssize_t end,
8411 int direction)
8412{
8413 char *result;
8414 int kind;
8415 if (PyUnicode_READY(str) == -1)
8416 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02008417 if (start < 0 || end < 0) {
8418 PyErr_SetString(PyExc_IndexError, "string index out of range");
8419 return -2;
8420 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008421 if (end > PyUnicode_GET_LENGTH(str))
8422 end = PyUnicode_GET_LENGTH(str);
8423 kind = PyUnicode_KIND(str);
8424 result = findchar(PyUnicode_1BYTE_DATA(str)
8425 + PyUnicode_KIND_SIZE(kind, start),
8426 kind,
8427 end-start, ch, direction);
8428 if (!result)
8429 return -1;
8430 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8431}
8432
Alexander Belopolsky40018472011-02-26 01:02:56 +00008433static int
8434tailmatch(PyUnicodeObject *self,
8435 PyUnicodeObject *substring,
8436 Py_ssize_t start,
8437 Py_ssize_t end,
8438 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008439{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008440 int kind_self;
8441 int kind_sub;
8442 void *data_self;
8443 void *data_sub;
8444 Py_ssize_t offset;
8445 Py_ssize_t i;
8446 Py_ssize_t end_sub;
8447
8448 if (PyUnicode_READY(self) == -1 ||
8449 PyUnicode_READY(substring) == -1)
8450 return 0;
8451
8452 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008453 return 1;
8454
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008455 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8456 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008457 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008458 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008459
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008460 kind_self = PyUnicode_KIND(self);
8461 data_self = PyUnicode_DATA(self);
8462 kind_sub = PyUnicode_KIND(substring);
8463 data_sub = PyUnicode_DATA(substring);
8464 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8465
8466 if (direction > 0)
8467 offset = end;
8468 else
8469 offset = start;
8470
8471 if (PyUnicode_READ(kind_self, data_self, offset) ==
8472 PyUnicode_READ(kind_sub, data_sub, 0) &&
8473 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8474 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8475 /* If both are of the same kind, memcmp is sufficient */
8476 if (kind_self == kind_sub) {
8477 return ! memcmp((char *)data_self +
8478 (offset * PyUnicode_CHARACTER_SIZE(substring)),
8479 data_sub,
8480 PyUnicode_GET_LENGTH(substring) *
8481 PyUnicode_CHARACTER_SIZE(substring));
8482 }
8483 /* otherwise we have to compare each character by first accesing it */
8484 else {
8485 /* We do not need to compare 0 and len(substring)-1 because
8486 the if statement above ensured already that they are equal
8487 when we end up here. */
8488 // TODO: honor direction and do a forward or backwards search
8489 for (i = 1; i < end_sub; ++i) {
8490 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8491 PyUnicode_READ(kind_sub, data_sub, i))
8492 return 0;
8493 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008494 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008495 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008496 }
8497
8498 return 0;
8499}
8500
Alexander Belopolsky40018472011-02-26 01:02:56 +00008501Py_ssize_t
8502PyUnicode_Tailmatch(PyObject *str,
8503 PyObject *substr,
8504 Py_ssize_t start,
8505 Py_ssize_t end,
8506 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008507{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008508 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008509
Guido van Rossumd57fd912000-03-10 22:53:23 +00008510 str = PyUnicode_FromObject(str);
8511 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008512 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008513 substr = PyUnicode_FromObject(substr);
8514 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008515 Py_DECREF(str);
8516 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008517 }
Tim Petersced69f82003-09-16 20:30:58 +00008518
Guido van Rossumd57fd912000-03-10 22:53:23 +00008519 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00008520 (PyUnicodeObject *)substr,
8521 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008522 Py_DECREF(str);
8523 Py_DECREF(substr);
8524 return result;
8525}
8526
Guido van Rossumd57fd912000-03-10 22:53:23 +00008527/* Apply fixfct filter to the Unicode object self and return a
8528 reference to the modified object */
8529
Alexander Belopolsky40018472011-02-26 01:02:56 +00008530static PyObject *
8531fixup(PyUnicodeObject *self,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008532 Py_UCS4 (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008533{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008534 PyObject *u;
8535 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008536
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008537 if (PyUnicode_READY(self) == -1)
8538 return NULL;
8539 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8540 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8541 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008542 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008543 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008544
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008545 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
8546 PyUnicode_GET_LENGTH(u) * PyUnicode_CHARACTER_SIZE(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008547
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008548 /* fix functions return the new maximum character in a string,
8549 if the kind of the resulting unicode object does not change,
8550 everything is fine. Otherwise we need to change the string kind
8551 and re-run the fix function. */
8552 maxchar_new = fixfct((PyUnicodeObject*)u);
8553 if (maxchar_new == 0)
8554 /* do nothing, keep maxchar_new at 0 which means no changes. */;
8555 else if (maxchar_new <= 127)
8556 maxchar_new = 127;
8557 else if (maxchar_new <= 255)
8558 maxchar_new = 255;
8559 else if (maxchar_new <= 65535)
8560 maxchar_new = 65535;
8561 else
8562 maxchar_new = 1114111; /* 0x10ffff */
8563
8564 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008565 /* fixfct should return TRUE if it modified the buffer. If
8566 FALSE, return a reference to the original buffer instead
8567 (to save space, not time) */
8568 Py_INCREF(self);
8569 Py_DECREF(u);
8570 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008571 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008572 else if (maxchar_new == maxchar_old) {
8573 return u;
8574 }
8575 else {
8576 /* In case the maximum character changed, we need to
8577 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008578 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008579 if (v == NULL) {
8580 Py_DECREF(u);
8581 return NULL;
8582 }
8583 if (maxchar_new > maxchar_old) {
8584 /* If the maxchar increased so that the kind changed, not all
8585 characters are representable anymore and we need to fix the
8586 string again. This only happens in very few cases. */
Victor Stinner157f83f2011-09-28 21:41:31 +02008587 if (PyUnicode_CopyCharacters(v, 0,
8588 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008589 PyUnicode_GET_LENGTH(self)) < 0)
8590 {
8591 Py_DECREF(u);
8592 return NULL;
8593 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008594 maxchar_old = fixfct((PyUnicodeObject*)v);
8595 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8596 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008597 else {
Victor Stinner157f83f2011-09-28 21:41:31 +02008598 if (PyUnicode_CopyCharacters(v, 0,
8599 u, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008600 PyUnicode_GET_LENGTH(self)) < 0)
8601 {
8602 Py_DECREF(u);
8603 return NULL;
8604 }
8605 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008606
8607 Py_DECREF(u);
8608 return v;
8609 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008610}
8611
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008612static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008613fixupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008614{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008615 /* No need to call PyUnicode_READY(self) because this function is only
8616 called as a callback from fixup() which does it already. */
8617 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8618 const int kind = PyUnicode_KIND(self);
8619 void *data = PyUnicode_DATA(self);
8620 int touched = 0;
8621 Py_UCS4 maxchar = 0;
8622 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008623
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008624 for (i = 0; i < len; ++i) {
8625 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8626 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8627 if (up != ch) {
8628 if (up > maxchar)
8629 maxchar = up;
8630 PyUnicode_WRITE(kind, data, i, up);
8631 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008632 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008633 else if (ch > maxchar)
8634 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008635 }
8636
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008637 if (touched)
8638 return maxchar;
8639 else
8640 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008641}
8642
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008643static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008644fixlower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008645{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008646 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8647 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8648 const int kind = PyUnicode_KIND(self);
8649 void *data = PyUnicode_DATA(self);
8650 int touched = 0;
8651 Py_UCS4 maxchar = 0;
8652 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008653
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008654 for(i = 0; i < len; ++i) {
8655 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8656 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8657 if (lo != ch) {
8658 if (lo > maxchar)
8659 maxchar = lo;
8660 PyUnicode_WRITE(kind, data, i, lo);
8661 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008662 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008663 else if (ch > maxchar)
8664 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008665 }
8666
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008667 if (touched)
8668 return maxchar;
8669 else
8670 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008671}
8672
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008673static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008674fixswapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008675{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008676 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8677 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8678 const int kind = PyUnicode_KIND(self);
8679 void *data = PyUnicode_DATA(self);
8680 int touched = 0;
8681 Py_UCS4 maxchar = 0;
8682 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008683
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008684 for(i = 0; i < len; ++i) {
8685 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8686 Py_UCS4 nu = 0;
8687
8688 if (Py_UNICODE_ISUPPER(ch))
8689 nu = Py_UNICODE_TOLOWER(ch);
8690 else if (Py_UNICODE_ISLOWER(ch))
8691 nu = Py_UNICODE_TOUPPER(ch);
8692
8693 if (nu != 0) {
8694 if (nu > maxchar)
8695 maxchar = nu;
8696 PyUnicode_WRITE(kind, data, i, nu);
8697 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008698 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008699 else if (ch > maxchar)
8700 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008701 }
8702
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008703 if (touched)
8704 return maxchar;
8705 else
8706 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008707}
8708
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008709static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008710fixcapitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008711{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008712 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8713 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8714 const int kind = PyUnicode_KIND(self);
8715 void *data = PyUnicode_DATA(self);
8716 int touched = 0;
8717 Py_UCS4 maxchar = 0;
8718 Py_ssize_t i = 0;
8719 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00008720
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008721 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008722 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008723
8724 ch = PyUnicode_READ(kind, data, i);
8725 if (!Py_UNICODE_ISUPPER(ch)) {
8726 maxchar = Py_UNICODE_TOUPPER(ch);
8727 PyUnicode_WRITE(kind, data, i, maxchar);
8728 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008729 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008730 ++i;
8731 for(; i < len; ++i) {
8732 ch = PyUnicode_READ(kind, data, i);
8733 if (!Py_UNICODE_ISLOWER(ch)) {
8734 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8735 if (lo > maxchar)
8736 maxchar = lo;
8737 PyUnicode_WRITE(kind, data, i, lo);
8738 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008739 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008740 else if (ch > maxchar)
8741 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008742 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008743
8744 if (touched)
8745 return maxchar;
8746 else
8747 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008748}
8749
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008750static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008751fixtitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008752{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008753 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8754 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8755 const int kind = PyUnicode_KIND(self);
8756 void *data = PyUnicode_DATA(self);
8757 Py_UCS4 maxchar = 0;
8758 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008759 int previous_is_cased;
8760
8761 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008762 if (len == 1) {
8763 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8764 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
8765 if (ti != ch) {
8766 PyUnicode_WRITE(kind, data, i, ti);
8767 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00008768 }
8769 else
8770 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008771 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008772 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008773 for(; i < len; ++i) {
8774 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8775 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00008776
Benjamin Peterson29060642009-01-31 22:14:21 +00008777 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008778 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00008779 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008780 nu = Py_UNICODE_TOTITLE(ch);
8781
8782 if (nu > maxchar)
8783 maxchar = nu;
8784 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00008785
Benjamin Peterson29060642009-01-31 22:14:21 +00008786 if (Py_UNICODE_ISLOWER(ch) ||
8787 Py_UNICODE_ISUPPER(ch) ||
8788 Py_UNICODE_ISTITLE(ch))
8789 previous_is_cased = 1;
8790 else
8791 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008792 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008793 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008794}
8795
Tim Peters8ce9f162004-08-27 01:49:32 +00008796PyObject *
8797PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008798{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008799 PyObject *sep = NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008800 Py_ssize_t seplen = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008801 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00008802 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008803 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
8804 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00008805 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008806 Py_ssize_t sz, i, res_offset;
8807 Py_UCS4 maxchar = 0;
8808 Py_UCS4 item_maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008809
Tim Peters05eba1f2004-08-27 21:32:02 +00008810 fseq = PySequence_Fast(seq, "");
8811 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008812 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00008813 }
8814
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008815 /* NOTE: the following code can't call back into Python code,
8816 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00008817 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008818
Tim Peters05eba1f2004-08-27 21:32:02 +00008819 seqlen = PySequence_Fast_GET_SIZE(fseq);
8820 /* If empty sequence, return u"". */
8821 if (seqlen == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008822 res = PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008823 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00008824 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008825 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00008826 /* If singleton sequence with an exact Unicode, return that. */
8827 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008828 item = items[0];
8829 if (PyUnicode_CheckExact(item)) {
8830 Py_INCREF(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008831 res = item;
Benjamin Peterson29060642009-01-31 22:14:21 +00008832 goto Done;
8833 }
Tim Peters8ce9f162004-08-27 01:49:32 +00008834 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008835 else {
8836 /* Set up sep and seplen */
8837 if (separator == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008838 /* fall back to a blank space separator */
8839 sep = PyUnicode_FromOrdinal(' ');
Victor Stinnere9a29352011-10-01 02:14:59 +02008840 if (!sep)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008841 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00008842 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008843 else {
8844 if (!PyUnicode_Check(separator)) {
8845 PyErr_Format(PyExc_TypeError,
8846 "separator: expected str instance,"
8847 " %.80s found",
8848 Py_TYPE(separator)->tp_name);
8849 goto onError;
8850 }
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02008851 if (PyUnicode_READY(separator))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008852 goto onError;
8853 sep = separator;
8854 seplen = PyUnicode_GET_LENGTH(separator);
8855 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
8856 /* inc refcount to keep this code path symetric with the
8857 above case of a blank separator */
8858 Py_INCREF(sep);
Tim Peters05eba1f2004-08-27 21:32:02 +00008859 }
8860 }
8861
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008862 /* There are at least two things to join, or else we have a subclass
8863 * of str in the sequence.
8864 * Do a pre-pass to figure out the total amount of space we'll
8865 * need (sz), and see whether all argument are strings.
8866 */
8867 sz = 0;
8868 for (i = 0; i < seqlen; i++) {
8869 const Py_ssize_t old_sz = sz;
8870 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00008871 if (!PyUnicode_Check(item)) {
8872 PyErr_Format(PyExc_TypeError,
8873 "sequence item %zd: expected str instance,"
8874 " %.80s found",
8875 i, Py_TYPE(item)->tp_name);
8876 goto onError;
8877 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008878 if (PyUnicode_READY(item) == -1)
8879 goto onError;
8880 sz += PyUnicode_GET_LENGTH(item);
8881 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
8882 if (item_maxchar > maxchar)
8883 maxchar = item_maxchar;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008884 if (i != 0)
8885 sz += seplen;
8886 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
8887 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008888 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008889 goto onError;
8890 }
8891 }
Tim Petersced69f82003-09-16 20:30:58 +00008892
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008893 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008894 if (res == NULL)
8895 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00008896
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008897 /* Catenate everything. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008898 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinner9ce5a832011-10-03 23:36:02 +02008899 Py_ssize_t itemlen, copied;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008900 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00008901 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02008902 if (i && seplen != 0) {
8903 copied = PyUnicode_CopyCharacters(res, res_offset,
8904 sep, 0, seplen);
8905 if (copied < 0)
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008906 goto onError;
Victor Stinner9ce5a832011-10-03 23:36:02 +02008907#ifdef Py_DEBUG
8908 res_offset += copied;
8909#else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008910 res_offset += seplen;
Victor Stinner9ce5a832011-10-03 23:36:02 +02008911#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00008912 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02008913 itemlen = PyUnicode_GET_LENGTH(item);
8914 if (itemlen != 0) {
8915 copied = PyUnicode_CopyCharacters(res, res_offset,
8916 item, 0, itemlen);
8917 if (copied < 0)
8918 goto onError;
8919#ifdef Py_DEBUG
8920 res_offset += copied;
8921#else
8922 res_offset += itemlen;
8923#endif
8924 }
Tim Peters05eba1f2004-08-27 21:32:02 +00008925 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008926 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00008927
Benjamin Peterson29060642009-01-31 22:14:21 +00008928 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00008929 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008930 Py_XDECREF(sep);
8931 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008932
Benjamin Peterson29060642009-01-31 22:14:21 +00008933 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00008934 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008935 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00008936 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008937 return NULL;
8938}
8939
/* FILL(kind, data, value, start, length)
 *
 * Store `value` into `length` consecutive code units of the character
 * buffer `data`, beginning at code-unit index `start`.  `kind` selects the
 * code-unit width: PyUnicode_1BYTE_KIND uses memset() on unsigned char,
 * PyUnicode_2BYTE_KIND writes Py_UCS2 units, and any other kind is written
 * as Py_UCS4.  PyUnicode_WCHAR_KIND is rejected by an assertion.
 *
 * Every argument is evaluated exactly once and captured in a local, so
 * callers may pass arbitrary expressions (including ones containing
 * low-precedence operators or side effects).  `value` is narrowed to the
 * unit width on store; the caller is responsible for it fitting.
 */
#define FILL(kind, data, value, start, length) \
    do { \
        const int kind_ = (kind); \
        const Py_ssize_t start_ = (start); \
        const Py_ssize_t length_ = (length); \
        const Py_UCS4 value_ = (value); \
        Py_ssize_t i_; \
        assert(kind_ != PyUnicode_WCHAR_KIND); \
        switch (kind_) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char *to_ = (unsigned char *)(data) + start_; \
            memset(to_, (unsigned char)value_, length_); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 *to_ = (Py_UCS2 *)(data) + start_; \
            for (i_ = 0; i_ < length_; ++i_) \
                to_[i_] = (Py_UCS2)value_; \
            break; \
        } \
        default: { \
            Py_UCS4 *to_ = (Py_UCS4 *)(data) + start_; \
            for (i_ = 0; i_ < length_; ++i_) \
                to_[i_] = value_; \
            break; \
        } \
        } \
    } while (0)
8962
Alexander Belopolsky40018472011-02-26 01:02:56 +00008963static PyUnicodeObject *
8964pad(PyUnicodeObject *self,
8965 Py_ssize_t left,
8966 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008967 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008968{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008969 PyObject *u;
8970 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008971 int kind;
8972 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008973
8974 if (left < 0)
8975 left = 0;
8976 if (right < 0)
8977 right = 0;
8978
Tim Peters7a29bd52001-09-12 03:03:31 +00008979 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008980 Py_INCREF(self);
8981 return self;
8982 }
8983
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008984 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
8985 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00008986 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
8987 return NULL;
8988 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008989 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
8990 if (fill > maxchar)
8991 maxchar = fill;
8992 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008993 if (!u)
8994 return NULL;
8995
8996 kind = PyUnicode_KIND(u);
8997 data = PyUnicode_DATA(u);
8998 if (left)
8999 FILL(kind, data, fill, 0, left);
9000 if (right)
9001 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinner157f83f2011-09-28 21:41:31 +02009002 if (PyUnicode_CopyCharacters(u, left,
9003 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009004 _PyUnicode_LENGTH(self)) < 0)
9005 {
9006 Py_DECREF(u);
9007 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009008 }
9009
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009010 return (PyUnicodeObject*)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009011}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009012#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009013
Alexander Belopolsky40018472011-02-26 01:02:56 +00009014PyObject *
9015PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009016{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009017 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009018
9019 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009020 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009021 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009022
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009023 switch(PyUnicode_KIND(string)) {
9024 case PyUnicode_1BYTE_KIND:
9025 list = ucs1lib_splitlines(
9026 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
9027 PyUnicode_GET_LENGTH(string), keepends);
9028 break;
9029 case PyUnicode_2BYTE_KIND:
9030 list = ucs2lib_splitlines(
9031 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
9032 PyUnicode_GET_LENGTH(string), keepends);
9033 break;
9034 case PyUnicode_4BYTE_KIND:
9035 list = ucs4lib_splitlines(
9036 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
9037 PyUnicode_GET_LENGTH(string), keepends);
9038 break;
9039 default:
9040 assert(0);
9041 list = 0;
9042 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009043 Py_DECREF(string);
9044 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009045}
9046
Alexander Belopolsky40018472011-02-26 01:02:56 +00009047static PyObject *
9048split(PyUnicodeObject *self,
9049 PyUnicodeObject *substring,
9050 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009051{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009052 int kind1, kind2, kind;
9053 void *buf1, *buf2;
9054 Py_ssize_t len1, len2;
9055 PyObject* out;
9056
Guido van Rossumd57fd912000-03-10 22:53:23 +00009057 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009058 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009059
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009060 if (PyUnicode_READY(self) == -1)
9061 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009062
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009063 if (substring == NULL)
9064 switch(PyUnicode_KIND(self)) {
9065 case PyUnicode_1BYTE_KIND:
9066 return ucs1lib_split_whitespace(
9067 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9068 PyUnicode_GET_LENGTH(self), maxcount
9069 );
9070 case PyUnicode_2BYTE_KIND:
9071 return ucs2lib_split_whitespace(
9072 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9073 PyUnicode_GET_LENGTH(self), maxcount
9074 );
9075 case PyUnicode_4BYTE_KIND:
9076 return ucs4lib_split_whitespace(
9077 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9078 PyUnicode_GET_LENGTH(self), maxcount
9079 );
9080 default:
9081 assert(0);
9082 return NULL;
9083 }
9084
9085 if (PyUnicode_READY(substring) == -1)
9086 return NULL;
9087
9088 kind1 = PyUnicode_KIND(self);
9089 kind2 = PyUnicode_KIND(substring);
9090 kind = kind1 > kind2 ? kind1 : kind2;
9091 buf1 = PyUnicode_DATA(self);
9092 buf2 = PyUnicode_DATA(substring);
9093 if (kind1 != kind)
9094 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9095 if (!buf1)
9096 return NULL;
9097 if (kind2 != kind)
9098 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9099 if (!buf2) {
9100 if (kind1 != kind) PyMem_Free(buf1);
9101 return NULL;
9102 }
9103 len1 = PyUnicode_GET_LENGTH(self);
9104 len2 = PyUnicode_GET_LENGTH(substring);
9105
9106 switch(kind) {
9107 case PyUnicode_1BYTE_KIND:
9108 out = ucs1lib_split(
9109 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9110 break;
9111 case PyUnicode_2BYTE_KIND:
9112 out = ucs2lib_split(
9113 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9114 break;
9115 case PyUnicode_4BYTE_KIND:
9116 out = ucs4lib_split(
9117 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9118 break;
9119 default:
9120 out = NULL;
9121 }
9122 if (kind1 != kind)
9123 PyMem_Free(buf1);
9124 if (kind2 != kind)
9125 PyMem_Free(buf2);
9126 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009127}
9128
Alexander Belopolsky40018472011-02-26 01:02:56 +00009129static PyObject *
9130rsplit(PyUnicodeObject *self,
9131 PyUnicodeObject *substring,
9132 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009133{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009134 int kind1, kind2, kind;
9135 void *buf1, *buf2;
9136 Py_ssize_t len1, len2;
9137 PyObject* out;
9138
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009139 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009140 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009141
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009142 if (PyUnicode_READY(self) == -1)
9143 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009144
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009145 if (substring == NULL)
9146 switch(PyUnicode_KIND(self)) {
9147 case PyUnicode_1BYTE_KIND:
9148 return ucs1lib_rsplit_whitespace(
9149 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9150 PyUnicode_GET_LENGTH(self), maxcount
9151 );
9152 case PyUnicode_2BYTE_KIND:
9153 return ucs2lib_rsplit_whitespace(
9154 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9155 PyUnicode_GET_LENGTH(self), maxcount
9156 );
9157 case PyUnicode_4BYTE_KIND:
9158 return ucs4lib_rsplit_whitespace(
9159 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9160 PyUnicode_GET_LENGTH(self), maxcount
9161 );
9162 default:
9163 assert(0);
9164 return NULL;
9165 }
9166
9167 if (PyUnicode_READY(substring) == -1)
9168 return NULL;
9169
9170 kind1 = PyUnicode_KIND(self);
9171 kind2 = PyUnicode_KIND(substring);
9172 kind = kind1 > kind2 ? kind1 : kind2;
9173 buf1 = PyUnicode_DATA(self);
9174 buf2 = PyUnicode_DATA(substring);
9175 if (kind1 != kind)
9176 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9177 if (!buf1)
9178 return NULL;
9179 if (kind2 != kind)
9180 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9181 if (!buf2) {
9182 if (kind1 != kind) PyMem_Free(buf1);
9183 return NULL;
9184 }
9185 len1 = PyUnicode_GET_LENGTH(self);
9186 len2 = PyUnicode_GET_LENGTH(substring);
9187
9188 switch(kind) {
9189 case PyUnicode_1BYTE_KIND:
9190 out = ucs1lib_rsplit(
9191 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9192 break;
9193 case PyUnicode_2BYTE_KIND:
9194 out = ucs2lib_rsplit(
9195 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9196 break;
9197 case PyUnicode_4BYTE_KIND:
9198 out = ucs4lib_rsplit(
9199 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9200 break;
9201 default:
9202 out = NULL;
9203 }
9204 if (kind1 != kind)
9205 PyMem_Free(buf1);
9206 if (kind2 != kind)
9207 PyMem_Free(buf2);
9208 return out;
9209}
9210
9211static Py_ssize_t
9212anylib_find(int kind, void *buf1, Py_ssize_t len1,
9213 void *buf2, Py_ssize_t len2, Py_ssize_t offset)
9214{
9215 switch(kind) {
9216 case PyUnicode_1BYTE_KIND:
9217 return ucs1lib_find(buf1, len1, buf2, len2, offset);
9218 case PyUnicode_2BYTE_KIND:
9219 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9220 case PyUnicode_4BYTE_KIND:
9221 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9222 }
9223 assert(0);
9224 return -1;
9225}
9226
9227static Py_ssize_t
9228anylib_count(int kind, void* sbuf, Py_ssize_t slen,
9229 void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
9230{
9231 switch(kind) {
9232 case PyUnicode_1BYTE_KIND:
9233 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
9234 case PyUnicode_2BYTE_KIND:
9235 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9236 case PyUnicode_4BYTE_KIND:
9237 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9238 }
9239 assert(0);
9240 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009241}
9242
Alexander Belopolsky40018472011-02-26 01:02:56 +00009243static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009244replace(PyObject *self, PyObject *str1,
9245 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009246{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009247 PyObject *u;
9248 char *sbuf = PyUnicode_DATA(self);
9249 char *buf1 = PyUnicode_DATA(str1);
9250 char *buf2 = PyUnicode_DATA(str2);
9251 int srelease = 0, release1 = 0, release2 = 0;
9252 int skind = PyUnicode_KIND(self);
9253 int kind1 = PyUnicode_KIND(str1);
9254 int kind2 = PyUnicode_KIND(str2);
9255 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9256 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9257 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009258
9259 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009260 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009261 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009262 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009263
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009264 if (skind < kind1)
9265 /* substring too wide to be present */
9266 goto nothing;
9267
9268 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00009269 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009270 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009271 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009272 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009273 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009274 /* replace characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009275 Py_UCS4 u1, u2, maxchar;
9276 int mayshrink, rkind;
9277 u1 = PyUnicode_READ_CHAR(str1, 0);
9278 if (!findchar(sbuf, PyUnicode_KIND(self),
9279 slen, u1, 1))
Thomas Wouters477c8d52006-05-27 19:21:47 +00009280 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009281 u2 = PyUnicode_READ_CHAR(str2, 0);
9282 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9283 /* Replacing u1 with u2 may cause a maxchar reduction in the
9284 result string. */
9285 mayshrink = maxchar > 127;
9286 if (u2 > maxchar) {
9287 maxchar = u2;
9288 mayshrink = 0;
9289 }
9290 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009291 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009292 goto error;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009293 if (PyUnicode_CopyCharacters(u, 0,
9294 (PyObject*)self, 0, slen) < 0)
9295 {
9296 Py_DECREF(u);
9297 return NULL;
9298 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009299 rkind = PyUnicode_KIND(u);
9300 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
9301 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009302 if (--maxcount < 0)
9303 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009304 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009305 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009306 if (mayshrink) {
9307 PyObject *tmp = u;
9308 u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp),
9309 PyUnicode_GET_LENGTH(tmp));
9310 Py_DECREF(tmp);
9311 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009312 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009313 int rkind = skind;
9314 char *res;
9315 if (kind1 < rkind) {
9316 /* widen substring */
9317 buf1 = _PyUnicode_AsKind(str1, rkind);
9318 if (!buf1) goto error;
9319 release1 = 1;
9320 }
9321 i = anylib_find(rkind, sbuf, slen, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009322 if (i < 0)
9323 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009324 if (rkind > kind2) {
9325 /* widen replacement */
9326 buf2 = _PyUnicode_AsKind(str2, rkind);
9327 if (!buf2) goto error;
9328 release2 = 1;
9329 }
9330 else if (rkind < kind2) {
9331 /* widen self and buf1 */
9332 rkind = kind2;
9333 if (release1) PyMem_Free(buf1);
9334 sbuf = _PyUnicode_AsKind(self, rkind);
9335 if (!sbuf) goto error;
9336 srelease = 1;
9337 buf1 = _PyUnicode_AsKind(str1, rkind);
9338 if (!buf1) goto error;
9339 release1 = 1;
9340 }
9341 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen));
9342 if (!res) {
9343 PyErr_NoMemory();
9344 goto error;
9345 }
9346 memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen));
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009347 /* change everything in-place, starting with this one */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009348 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9349 buf2,
9350 PyUnicode_KIND_SIZE(rkind, len2));
9351 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009352
9353 while ( --maxcount > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009354 i = anylib_find(rkind, sbuf+PyUnicode_KIND_SIZE(rkind, i),
9355 slen-i,
9356 buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009357 if (i == -1)
9358 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009359 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9360 buf2,
9361 PyUnicode_KIND_SIZE(rkind, len2));
9362 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009363 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009364
9365 u = PyUnicode_FromKindAndData(rkind, res, slen);
9366 PyMem_Free(res);
9367 if (!u) goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009368 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009369 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009370
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009371 Py_ssize_t n, i, j, ires;
9372 Py_ssize_t product, new_size;
9373 int rkind = skind;
9374 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009375
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009376 if (kind1 < rkind) {
9377 buf1 = _PyUnicode_AsKind(str1, rkind);
9378 if (!buf1) goto error;
9379 release1 = 1;
9380 }
9381 n = anylib_count(rkind, sbuf, slen, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009382 if (n == 0)
9383 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009384 if (kind2 < rkind) {
9385 buf2 = _PyUnicode_AsKind(str2, rkind);
9386 if (!buf2) goto error;
9387 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009388 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009389 else if (kind2 > rkind) {
9390 rkind = kind2;
9391 sbuf = _PyUnicode_AsKind(self, rkind);
9392 if (!sbuf) goto error;
9393 srelease = 1;
9394 if (release1) PyMem_Free(buf1);
9395 buf1 = _PyUnicode_AsKind(str1, rkind);
9396 if (!buf1) goto error;
9397 release1 = 1;
9398 }
9399 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
9400 PyUnicode_GET_LENGTH(str1))); */
9401 product = n * (len2-len1);
9402 if ((product / (len2-len1)) != n) {
9403 PyErr_SetString(PyExc_OverflowError,
9404 "replace string is too long");
9405 goto error;
9406 }
9407 new_size = slen + product;
9408 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
9409 PyErr_SetString(PyExc_OverflowError,
9410 "replace string is too long");
9411 goto error;
9412 }
9413 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size));
9414 if (!res)
9415 goto error;
9416 ires = i = 0;
9417 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009418 while (n-- > 0) {
9419 /* look for next match */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009420 j = anylib_find(rkind,
9421 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9422 slen-i, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009423 if (j == -1)
9424 break;
9425 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009426 /* copy unchanged part [i:j] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009427 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9428 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9429 PyUnicode_KIND_SIZE(rkind, j-i));
9430 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009431 }
9432 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009433 if (len2 > 0) {
9434 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9435 buf2,
9436 PyUnicode_KIND_SIZE(rkind, len2));
9437 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009438 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009439 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009440 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009441 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009442 /* copy tail [i:] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009443 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9444 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9445 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009446 } else {
9447 /* interleave */
9448 while (n > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009449 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9450 buf2,
9451 PyUnicode_KIND_SIZE(rkind, len2));
9452 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009453 if (--n <= 0)
9454 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009455 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9456 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9457 PyUnicode_KIND_SIZE(rkind, 1));
9458 ires++;
9459 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009460 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009461 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9462 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9463 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009464 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009465 u = PyUnicode_FromKindAndData(rkind, res, new_size);
Martin v. Löwis0b1d3482011-10-01 16:35:40 +02009466 PyMem_Free(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009467 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009468 if (srelease)
9469 PyMem_FREE(sbuf);
9470 if (release1)
9471 PyMem_FREE(buf1);
9472 if (release2)
9473 PyMem_FREE(buf2);
9474 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009475
Benjamin Peterson29060642009-01-31 22:14:21 +00009476 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00009477 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009478 if (srelease)
9479 PyMem_FREE(sbuf);
9480 if (release1)
9481 PyMem_FREE(buf1);
9482 if (release2)
9483 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009484 if (PyUnicode_CheckExact(self)) {
9485 Py_INCREF(self);
9486 return (PyObject *) self;
9487 }
Victor Stinner034f6cf2011-09-30 02:26:44 +02009488 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009489 error:
9490 if (srelease && sbuf)
9491 PyMem_FREE(sbuf);
9492 if (release1 && buf1)
9493 PyMem_FREE(buf1);
9494 if (release2 && buf2)
9495 PyMem_FREE(buf2);
9496 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009497}
9498
9499/* --- Unicode Object Methods --------------------------------------------- */
9500
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009501PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009502 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009503\n\
9504Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009505characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009506
9507static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009508unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009509{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009510 return fixup(self, fixtitle);
9511}
9512
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009513PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009514 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009515\n\
9516Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00009517have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009518
9519static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009520unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009521{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009522 return fixup(self, fixcapitalize);
9523}
9524
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out).  Splits `self` on whitespace, replaces every
   list item by its fixup(..., fixcapitalize) result, and joins the list
   again with PyUnicode_Join(NULL, ...).  Returns NULL on failure. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *old_word = PyList_GET_ITEM(words, idx);
        PyObject *new_word = fixup((PyUnicodeObject *)old_word,
                                   fixcapitalize);
        if (new_word == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* The list slot owned old_word; hand the slot over to new_word. */
        Py_DECREF(old_word);
        PyList_SET_ITEM(words, idx, new_word);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
9562
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009563/* Argument converter. Coerces to a single unicode character */
9564
9565static int
9566convert_uc(PyObject *obj, void *addr)
9567{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009568 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009569 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009570
Benjamin Peterson14339b62009-01-31 16:36:08 +00009571 uniobj = PyUnicode_FromObject(obj);
9572 if (uniobj == NULL) {
9573 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009574 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009575 return 0;
9576 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009577 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009578 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009579 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009580 Py_DECREF(uniobj);
9581 return 0;
9582 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009583 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009584 Py_DECREF(uniobj);
9585 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009586}
9587
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009588PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009589 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009590\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009591Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009592done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009593
9594static PyObject *
9595unicode_center(PyUnicodeObject *self, PyObject *args)
9596{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009597 Py_ssize_t marg, left;
9598 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009599 Py_UCS4 fillchar = ' ';
9600
Victor Stinnere9a29352011-10-01 02:14:59 +02009601 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009602 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009603
Victor Stinnere9a29352011-10-01 02:14:59 +02009604 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009605 return NULL;
9606
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009607 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009608 Py_INCREF(self);
9609 return (PyObject*) self;
9610 }
9611
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009612 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009613 left = marg / 2 + (marg & width & 1);
9614
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009615 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009616}
9617
Marc-André Lemburge5034372000-08-08 08:04:29 +00009618#if 0
9619
9620/* This code should go into some future Unicode collation support
9621 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00009622 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00009623
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009624/* speedy UTF-16 code point order comparison */
9625/* gleaned from: */
9626/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
9627
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009628static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009629{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009630 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00009631 0, 0, 0, 0, 0, 0, 0, 0,
9632 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009633 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009634};
9635
Guido van Rossumd57fd912000-03-10 22:53:23 +00009636static int
9637unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9638{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009639 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009640
Guido van Rossumd57fd912000-03-10 22:53:23 +00009641 Py_UNICODE *s1 = str1->str;
9642 Py_UNICODE *s2 = str2->str;
9643
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009644 len1 = str1->_base._base.length;
9645 len2 = str2->_base._base.length;
Tim Petersced69f82003-09-16 20:30:58 +00009646
Guido van Rossumd57fd912000-03-10 22:53:23 +00009647 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00009648 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009649
9650 c1 = *s1++;
9651 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00009652
Benjamin Peterson29060642009-01-31 22:14:21 +00009653 if (c1 > (1<<11) * 26)
9654 c1 += utf16Fixup[c1>>11];
9655 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009656 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009657 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00009658
9659 if (c1 != c2)
9660 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00009661
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009662 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009663 }
9664
9665 return (len1 < len2) ? -1 : (len1 != len2);
9666}
9667
Marc-André Lemburge5034372000-08-08 08:04:29 +00009668#else
9669
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009670/* This function assumes that str1 and str2 are readied by the caller. */
9671
Marc-André Lemburge5034372000-08-08 08:04:29 +00009672static int
9673unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9674{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009675 int kind1, kind2;
9676 void *data1, *data2;
9677 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009678
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009679 kind1 = PyUnicode_KIND(str1);
9680 kind2 = PyUnicode_KIND(str2);
9681 data1 = PyUnicode_DATA(str1);
9682 data2 = PyUnicode_DATA(str2);
9683 len1 = PyUnicode_GET_LENGTH(str1);
9684 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +00009685
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009686 for (i = 0; i < len1 && i < len2; ++i) {
9687 Py_UCS4 c1, c2;
9688 c1 = PyUnicode_READ(kind1, data1, i);
9689 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +00009690
9691 if (c1 != c2)
9692 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009693 }
9694
9695 return (len1 < len2) ? -1 : (len1 != len2);
9696}
9697
9698#endif
9699
Alexander Belopolsky40018472011-02-26 01:02:56 +00009700int
9701PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009702{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009703 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9704 if (PyUnicode_READY(left) == -1 ||
9705 PyUnicode_READY(right) == -1)
9706 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009707 return unicode_compare((PyUnicodeObject *)left,
9708 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009709 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009710 PyErr_Format(PyExc_TypeError,
9711 "Can't compare %.100s and %.100s",
9712 left->ob_type->tp_name,
9713 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009714 return -1;
9715}
9716
Martin v. Löwis5b222132007-06-10 09:51:05 +00009717int
9718PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
9719{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009720 Py_ssize_t i;
9721 int kind;
9722 void *data;
9723 Py_UCS4 chr;
9724
Victor Stinner910337b2011-10-03 03:20:16 +02009725 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009726 if (PyUnicode_READY(uni) == -1)
9727 return -1;
9728 kind = PyUnicode_KIND(uni);
9729 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009730 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009731 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
9732 if (chr != str[i])
9733 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00009734 /* This check keeps Python strings that end in '\0' from comparing equal
9735 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009736 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +00009737 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009738 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00009739 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009740 return 0;
9741}
9742
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009743
Benjamin Peterson29060642009-01-31 22:14:21 +00009744#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00009745 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009746
Alexander Belopolsky40018472011-02-26 01:02:56 +00009747PyObject *
9748PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009749{
9750 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009751
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009752 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9753 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009754 if (PyUnicode_READY(left) == -1 ||
9755 PyUnicode_READY(right) == -1)
9756 return NULL;
9757 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
9758 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009759 if (op == Py_EQ) {
9760 Py_INCREF(Py_False);
9761 return Py_False;
9762 }
9763 if (op == Py_NE) {
9764 Py_INCREF(Py_True);
9765 return Py_True;
9766 }
9767 }
9768 if (left == right)
9769 result = 0;
9770 else
9771 result = unicode_compare((PyUnicodeObject *)left,
9772 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009773
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009774 /* Convert the return value to a Boolean */
9775 switch (op) {
9776 case Py_EQ:
9777 v = TEST_COND(result == 0);
9778 break;
9779 case Py_NE:
9780 v = TEST_COND(result != 0);
9781 break;
9782 case Py_LE:
9783 v = TEST_COND(result <= 0);
9784 break;
9785 case Py_GE:
9786 v = TEST_COND(result >= 0);
9787 break;
9788 case Py_LT:
9789 v = TEST_COND(result == -1);
9790 break;
9791 case Py_GT:
9792 v = TEST_COND(result == 1);
9793 break;
9794 default:
9795 PyErr_BadArgument();
9796 return NULL;
9797 }
9798 Py_INCREF(v);
9799 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009800 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009801
Brian Curtindfc80e32011-08-10 20:28:54 -05009802 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009803}
9804
Alexander Belopolsky40018472011-02-26 01:02:56 +00009805int
9806PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00009807{
Thomas Wouters477c8d52006-05-27 19:21:47 +00009808 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009809 int kind1, kind2, kind;
9810 void *buf1, *buf2;
9811 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009812 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009813
9814 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00009815 sub = PyUnicode_FromObject(element);
9816 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009817 PyErr_Format(PyExc_TypeError,
9818 "'in <string>' requires string as left operand, not %s",
9819 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009820 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009821 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009822 if (PyUnicode_READY(sub) == -1)
9823 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009824
Thomas Wouters477c8d52006-05-27 19:21:47 +00009825 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +02009826 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009827 Py_DECREF(sub);
9828 return -1;
9829 }
9830
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009831 kind1 = PyUnicode_KIND(str);
9832 kind2 = PyUnicode_KIND(sub);
9833 kind = kind1 > kind2 ? kind1 : kind2;
9834 buf1 = PyUnicode_DATA(str);
9835 buf2 = PyUnicode_DATA(sub);
9836 if (kind1 != kind)
9837 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
9838 if (!buf1) {
9839 Py_DECREF(sub);
9840 return -1;
9841 }
9842 if (kind2 != kind)
9843 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
9844 if (!buf2) {
9845 Py_DECREF(sub);
9846 if (kind1 != kind) PyMem_Free(buf1);
9847 return -1;
9848 }
9849 len1 = PyUnicode_GET_LENGTH(str);
9850 len2 = PyUnicode_GET_LENGTH(sub);
9851
9852 switch(kind) {
9853 case PyUnicode_1BYTE_KIND:
9854 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
9855 break;
9856 case PyUnicode_2BYTE_KIND:
9857 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
9858 break;
9859 case PyUnicode_4BYTE_KIND:
9860 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
9861 break;
9862 default:
9863 result = -1;
9864 assert(0);
9865 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009866
9867 Py_DECREF(str);
9868 Py_DECREF(sub);
9869
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009870 if (kind1 != kind)
9871 PyMem_Free(buf1);
9872 if (kind2 != kind)
9873 PyMem_Free(buf2);
9874
Guido van Rossum403d68b2000-03-13 15:55:09 +00009875 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009876}
9877
Guido van Rossumd57fd912000-03-10 22:53:23 +00009878/* Concat to string or Unicode object giving a new Unicode object. */
9879
Alexander Belopolsky40018472011-02-26 01:02:56 +00009880PyObject *
9881PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009882{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009883 PyObject *u = NULL, *v = NULL, *w;
9884 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009885
9886 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009887 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009888 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009889 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009890 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009891 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009892 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009893
9894 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +02009895 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009896 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009897 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009898 }
Victor Stinnera464fc12011-10-02 20:39:30 +02009899 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009900 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009901 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009902 }
9903
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009904 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinnerff9e50f2011-09-28 22:17:19 +02009905 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009906
Guido van Rossumd57fd912000-03-10 22:53:23 +00009907 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009908 w = PyUnicode_New(
9909 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
9910 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009911 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009912 goto onError;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009913 if (PyUnicode_CopyCharacters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)) < 0)
9914 goto onError;
Victor Stinner157f83f2011-09-28 21:41:31 +02009915 if (PyUnicode_CopyCharacters(w, PyUnicode_GET_LENGTH(u),
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009916 v, 0,
9917 PyUnicode_GET_LENGTH(v)) < 0)
9918 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009919 Py_DECREF(u);
9920 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009921 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009922
Benjamin Peterson29060642009-01-31 22:14:21 +00009923 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009924 Py_XDECREF(u);
9925 Py_XDECREF(v);
9926 return NULL;
9927}
9928
Walter Dörwald1ab83302007-05-18 17:15:44 +00009929void
Victor Stinner23e56682011-10-03 03:54:37 +02009930PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +00009931{
Victor Stinner23e56682011-10-03 03:54:37 +02009932 PyObject *left, *res;
9933
9934 if (p_left == NULL) {
9935 if (!PyErr_Occurred())
9936 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +00009937 return;
9938 }
Victor Stinner23e56682011-10-03 03:54:37 +02009939 left = *p_left;
9940 if (right == NULL || !PyUnicode_Check(left)) {
9941 if (!PyErr_Occurred())
9942 PyErr_BadInternalCall();
9943 goto error;
9944 }
9945
9946 if (PyUnicode_CheckExact(left) && left != unicode_empty
9947 && PyUnicode_CheckExact(right) && right != unicode_empty
9948 && unicode_resizable(left)
9949 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
9950 || _PyUnicode_WSTR(left) != NULL))
9951 {
Victor Stinnerb8038952011-10-03 23:27:56 +02009952 Py_ssize_t left_len, right_len, new_len;
9953#ifdef Py_DEBUG
9954 Py_ssize_t copied;
9955#endif
Victor Stinner23e56682011-10-03 03:54:37 +02009956
Victor Stinner23e56682011-10-03 03:54:37 +02009957 if (PyUnicode_READY(left))
9958 goto error;
9959 if (PyUnicode_READY(right))
9960 goto error;
9961
9962 /* FIXME: support ascii+latin1, PyASCIIObject => PyCompactUnicodeObject */
9963 if (PyUnicode_MAX_CHAR_VALUE(right) <= PyUnicode_MAX_CHAR_VALUE(left))
9964 {
Victor Stinnerb8038952011-10-03 23:27:56 +02009965 left_len = PyUnicode_GET_LENGTH(left);
9966 right_len = PyUnicode_GET_LENGTH(right);
9967 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner23e56682011-10-03 03:54:37 +02009968 PyErr_SetString(PyExc_OverflowError,
9969 "strings are too large to concat");
9970 goto error;
9971 }
Victor Stinnerb8038952011-10-03 23:27:56 +02009972 new_len = left_len + right_len;
Victor Stinner23e56682011-10-03 03:54:37 +02009973
9974 /* Now we own the last reference to 'left', so we can resize it
9975 * in-place.
9976 */
9977 if (unicode_resize(&left, new_len) != 0) {
9978 /* XXX if _PyUnicode_Resize() fails, 'left' has been
9979 * deallocated so it cannot be put back into
9980 * 'variable'. The MemoryError is raised when there
9981 * is no value in 'variable', which might (very
9982 * remotely) be a cause of incompatibilities.
9983 */
9984 goto error;
9985 }
9986 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerb8038952011-10-03 23:27:56 +02009987#ifdef Py_DEBUG
9988 copied = PyUnicode_CopyCharacters(left, left_len,
Victor Stinner23e56682011-10-03 03:54:37 +02009989 right, 0,
Victor Stinnerb8038952011-10-03 23:27:56 +02009990 right_len);
Victor Stinner23e56682011-10-03 03:54:37 +02009991 assert(0 <= copied);
Victor Stinnerb8038952011-10-03 23:27:56 +02009992#else
9993 PyUnicode_CopyCharacters(left, left_len, right, 0, right_len);
9994#endif
Victor Stinner23e56682011-10-03 03:54:37 +02009995 *p_left = left;
9996 return;
9997 }
9998 }
9999
10000 res = PyUnicode_Concat(left, right);
10001 if (res == NULL)
10002 goto error;
10003 Py_DECREF(left);
10004 *p_left = res;
10005 return;
10006
10007error:
10008 Py_DECREF(*p_left);
10009 *p_left = NULL;
Walter Dörwald1ab83302007-05-18 17:15:44 +000010010}
10011
10012void
10013PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10014{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010015 PyUnicode_Append(pleft, right);
10016 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010017}
10018
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010019PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010020 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010021\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010022Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010023string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010024interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010025
10026static PyObject *
10027unicode_count(PyUnicodeObject *self, PyObject *args)
10028{
10029 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010030 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010031 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010032 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010033 int kind1, kind2, kind;
10034 void *buf1, *buf2;
10035 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010036
Jesus Ceaac451502011-04-20 17:09:23 +020010037 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10038 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010039 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010040
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010041 kind1 = PyUnicode_KIND(self);
10042 kind2 = PyUnicode_KIND(substring);
10043 kind = kind1 > kind2 ? kind1 : kind2;
10044 buf1 = PyUnicode_DATA(self);
10045 buf2 = PyUnicode_DATA(substring);
10046 if (kind1 != kind)
10047 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
10048 if (!buf1) {
10049 Py_DECREF(substring);
10050 return NULL;
10051 }
10052 if (kind2 != kind)
10053 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
10054 if (!buf2) {
10055 Py_DECREF(substring);
10056 if (kind1 != kind) PyMem_Free(buf1);
10057 return NULL;
10058 }
10059 len1 = PyUnicode_GET_LENGTH(self);
10060 len2 = PyUnicode_GET_LENGTH(substring);
10061
10062 ADJUST_INDICES(start, end, len1);
10063 switch(kind) {
10064 case PyUnicode_1BYTE_KIND:
10065 iresult = ucs1lib_count(
10066 ((Py_UCS1*)buf1) + start, end - start,
10067 buf2, len2, PY_SSIZE_T_MAX
10068 );
10069 break;
10070 case PyUnicode_2BYTE_KIND:
10071 iresult = ucs2lib_count(
10072 ((Py_UCS2*)buf1) + start, end - start,
10073 buf2, len2, PY_SSIZE_T_MAX
10074 );
10075 break;
10076 case PyUnicode_4BYTE_KIND:
10077 iresult = ucs4lib_count(
10078 ((Py_UCS4*)buf1) + start, end - start,
10079 buf2, len2, PY_SSIZE_T_MAX
10080 );
10081 break;
10082 default:
10083 assert(0); iresult = 0;
10084 }
10085
10086 result = PyLong_FromSsize_t(iresult);
10087
10088 if (kind1 != kind)
10089 PyMem_Free(buf1);
10090 if (kind2 != kind)
10091 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010092
10093 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010094
Guido van Rossumd57fd912000-03-10 22:53:23 +000010095 return result;
10096}
10097
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010098PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010099 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010100\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010101Encode S using the codec registered for encoding. Default encoding\n\
10102is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010103handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010104a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10105'xmlcharrefreplace' as well as any other name registered with\n\
10106codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010107
10108static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +000010109unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010110{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010111 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010112 char *encoding = NULL;
10113 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010114
Benjamin Peterson308d6372009-09-18 21:42:35 +000010115 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10116 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010117 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +000010118 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010119}
10120
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010121PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010122 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010123\n\
10124Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010125If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010126
10127static PyObject*
10128unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
10129{
10130 Py_UNICODE *e;
10131 Py_UNICODE *p;
10132 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010133 Py_UNICODE *qe;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010134 Py_ssize_t i, j, incr, wstr_length;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010135 PyUnicodeObject *u;
10136 int tabsize = 8;
10137
10138 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010139 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010140
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010141 if (PyUnicode_AsUnicodeAndSize((PyObject *)self, &wstr_length) == NULL)
10142 return NULL;
10143
Thomas Wouters7e474022000-07-16 12:04:32 +000010144 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010145 i = 0; /* chars up to and including most recent \n or \r */
10146 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010147 e = _PyUnicode_WSTR(self) + wstr_length; /* end of input */
10148 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010149 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010150 if (tabsize > 0) {
10151 incr = tabsize - (j % tabsize); /* cannot overflow */
10152 if (j > PY_SSIZE_T_MAX - incr)
10153 goto overflow1;
10154 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010155 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010156 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010157 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010158 if (j > PY_SSIZE_T_MAX - 1)
10159 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010160 j++;
10161 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010162 if (i > PY_SSIZE_T_MAX - j)
10163 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010164 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010165 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010166 }
10167 }
10168
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010169 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +000010170 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010171
Guido van Rossumd57fd912000-03-10 22:53:23 +000010172 /* Second pass: create output string and fill it */
10173 u = _PyUnicode_New(i + j);
10174 if (!u)
10175 return NULL;
10176
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010177 j = 0; /* same as in first pass */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010178 q = _PyUnicode_WSTR(u); /* next output char */
10179 qe = _PyUnicode_WSTR(u) + PyUnicode_GET_SIZE(u); /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +000010180
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010181 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010182 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010183 if (tabsize > 0) {
10184 i = tabsize - (j % tabsize);
10185 j += i;
10186 while (i--) {
10187 if (q >= qe)
10188 goto overflow2;
10189 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010190 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010191 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010192 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010193 else {
10194 if (q >= qe)
10195 goto overflow2;
10196 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010197 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010198 if (*p == '\n' || *p == '\r')
10199 j = 0;
10200 }
10201
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020010202 if (_PyUnicode_READY_REPLACE(&u)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010203 Py_DECREF(u);
10204 return NULL;
10205 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010206 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010207
10208 overflow2:
10209 Py_DECREF(u);
10210 overflow1:
10211 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10212 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010213}
10214
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010215PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010216 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010217\n\
10218Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010219such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010220arguments start and end are interpreted as in slice notation.\n\
10221\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010222Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010223
10224static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010225unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010226{
Jesus Ceaac451502011-04-20 17:09:23 +020010227 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010228 Py_ssize_t start;
10229 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010230 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010231
Jesus Ceaac451502011-04-20 17:09:23 +020010232 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10233 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010234 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010235
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010236 if (PyUnicode_READY(self) == -1)
10237 return NULL;
10238 if (PyUnicode_READY(substring) == -1)
10239 return NULL;
10240
10241 result = any_find_slice(
10242 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
10243 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010244 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010245
10246 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010247
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010248 if (result == -2)
10249 return NULL;
10250
Christian Heimes217cfd12007-12-02 14:31:20 +000010251 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010252}
10253
10254static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010255unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010256{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010257 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
10258 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010259 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010260 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010261}
10262
Guido van Rossumc2504932007-09-18 19:42:40 +000010263/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010010264 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000010265static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +000010266unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010267{
Guido van Rossumc2504932007-09-18 19:42:40 +000010268 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010010269 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010270
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010271 if (_PyUnicode_HASH(self) != -1)
10272 return _PyUnicode_HASH(self);
10273 if (PyUnicode_READY(self) == -1)
10274 return -1;
10275 len = PyUnicode_GET_LENGTH(self);
10276
10277 /* The hash function as a macro, gets expanded three times below. */
10278#define HASH(P) \
10279 x = (Py_uhash_t)*P << 7; \
10280 while (--len >= 0) \
10281 x = (1000003*x) ^ (Py_uhash_t)*P++;
10282
10283 switch (PyUnicode_KIND(self)) {
10284 case PyUnicode_1BYTE_KIND: {
10285 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
10286 HASH(c);
10287 break;
10288 }
10289 case PyUnicode_2BYTE_KIND: {
10290 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
10291 HASH(s);
10292 break;
10293 }
10294 default: {
10295 Py_UCS4 *l;
10296 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
10297 "Impossible switch case in unicode_hash");
10298 l = PyUnicode_4BYTE_DATA(self);
10299 HASH(l);
10300 break;
10301 }
10302 }
10303 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
10304
Guido van Rossumc2504932007-09-18 19:42:40 +000010305 if (x == -1)
10306 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010307 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010308 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010309}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010310#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000010311
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010312PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010313 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010314\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010315Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010316
10317static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010318unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010319{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010320 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +020010321 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010322 Py_ssize_t start;
10323 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010324
Jesus Ceaac451502011-04-20 17:09:23 +020010325 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
10326 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010327 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010328
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010329 if (PyUnicode_READY(self) == -1)
10330 return NULL;
10331 if (PyUnicode_READY(substring) == -1)
10332 return NULL;
10333
10334 result = any_find_slice(
10335 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
10336 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010337 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010338
10339 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010340
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010341 if (result == -2)
10342 return NULL;
10343
Guido van Rossumd57fd912000-03-10 22:53:23 +000010344 if (result < 0) {
10345 PyErr_SetString(PyExc_ValueError, "substring not found");
10346 return NULL;
10347 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010348
Christian Heimes217cfd12007-12-02 14:31:20 +000010349 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010350}
10351
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010352PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010353 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010354\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010355Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010356at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010357
10358static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010359unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010360{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010361 Py_ssize_t i, length;
10362 int kind;
10363 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010364 int cased;
10365
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010366 if (PyUnicode_READY(self) == -1)
10367 return NULL;
10368 length = PyUnicode_GET_LENGTH(self);
10369 kind = PyUnicode_KIND(self);
10370 data = PyUnicode_DATA(self);
10371
Guido van Rossumd57fd912000-03-10 22:53:23 +000010372 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010373 if (length == 1)
10374 return PyBool_FromLong(
10375 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010376
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010377 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010378 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010379 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010380
Guido van Rossumd57fd912000-03-10 22:53:23 +000010381 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010382 for (i = 0; i < length; i++) {
10383 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010384
Benjamin Peterson29060642009-01-31 22:14:21 +000010385 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
10386 return PyBool_FromLong(0);
10387 else if (!cased && Py_UNICODE_ISLOWER(ch))
10388 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010389 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010390 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010391}
10392
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010393PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010394 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010395\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010396Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010397at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010398
10399static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010400unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010401{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010402 Py_ssize_t i, length;
10403 int kind;
10404 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010405 int cased;
10406
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010407 if (PyUnicode_READY(self) == -1)
10408 return NULL;
10409 length = PyUnicode_GET_LENGTH(self);
10410 kind = PyUnicode_KIND(self);
10411 data = PyUnicode_DATA(self);
10412
Guido van Rossumd57fd912000-03-10 22:53:23 +000010413 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010414 if (length == 1)
10415 return PyBool_FromLong(
10416 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010417
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010418 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010419 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010420 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010421
Guido van Rossumd57fd912000-03-10 22:53:23 +000010422 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010423 for (i = 0; i < length; i++) {
10424 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010425
Benjamin Peterson29060642009-01-31 22:14:21 +000010426 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
10427 return PyBool_FromLong(0);
10428 else if (!cased && Py_UNICODE_ISUPPER(ch))
10429 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010430 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010431 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010432}
10433
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010434PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010435 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010436\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010437Return True if S is a titlecased string and there is at least one\n\
10438character in S, i.e. upper- and titlecase characters may only\n\
10439follow uncased characters and lowercase characters only cased ones.\n\
10440Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010441
10442static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010443unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010444{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010445 Py_ssize_t i, length;
10446 int kind;
10447 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010448 int cased, previous_is_cased;
10449
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010450 if (PyUnicode_READY(self) == -1)
10451 return NULL;
10452 length = PyUnicode_GET_LENGTH(self);
10453 kind = PyUnicode_KIND(self);
10454 data = PyUnicode_DATA(self);
10455
Guido van Rossumd57fd912000-03-10 22:53:23 +000010456 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010457 if (length == 1) {
10458 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10459 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
10460 (Py_UNICODE_ISUPPER(ch) != 0));
10461 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010462
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010463 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010464 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010465 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010466
Guido van Rossumd57fd912000-03-10 22:53:23 +000010467 cased = 0;
10468 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010469 for (i = 0; i < length; i++) {
10470 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010471
Benjamin Peterson29060642009-01-31 22:14:21 +000010472 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
10473 if (previous_is_cased)
10474 return PyBool_FromLong(0);
10475 previous_is_cased = 1;
10476 cased = 1;
10477 }
10478 else if (Py_UNICODE_ISLOWER(ch)) {
10479 if (!previous_is_cased)
10480 return PyBool_FromLong(0);
10481 previous_is_cased = 1;
10482 cased = 1;
10483 }
10484 else
10485 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010486 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010487 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010488}
10489
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010490PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010491 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010492\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010493Return True if all characters in S are whitespace\n\
10494and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010495
10496static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010497unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010498{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010499 Py_ssize_t i, length;
10500 int kind;
10501 void *data;
10502
10503 if (PyUnicode_READY(self) == -1)
10504 return NULL;
10505 length = PyUnicode_GET_LENGTH(self);
10506 kind = PyUnicode_KIND(self);
10507 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010508
Guido van Rossumd57fd912000-03-10 22:53:23 +000010509 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010510 if (length == 1)
10511 return PyBool_FromLong(
10512 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010513
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010514 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010515 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010516 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010517
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010518 for (i = 0; i < length; i++) {
10519 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010520 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010521 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010522 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010523 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010524}
10525
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010526PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010527 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010528\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010529Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010530and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010531
10532static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010533unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010534{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010535 Py_ssize_t i, length;
10536 int kind;
10537 void *data;
10538
10539 if (PyUnicode_READY(self) == -1)
10540 return NULL;
10541 length = PyUnicode_GET_LENGTH(self);
10542 kind = PyUnicode_KIND(self);
10543 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010544
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010545 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010546 if (length == 1)
10547 return PyBool_FromLong(
10548 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010549
10550 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010551 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010552 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010553
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010554 for (i = 0; i < length; i++) {
10555 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010556 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010557 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010558 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010559}
10560
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010561PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010562 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010563\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010564Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010565and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010566
10567static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010568unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010569{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010570 int kind;
10571 void *data;
10572 Py_ssize_t len, i;
10573
10574 if (PyUnicode_READY(self) == -1)
10575 return NULL;
10576
10577 kind = PyUnicode_KIND(self);
10578 data = PyUnicode_DATA(self);
10579 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010580
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010581 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010582 if (len == 1) {
10583 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10584 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10585 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010586
10587 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010588 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010589 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010590
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010591 for (i = 0; i < len; i++) {
10592 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010593 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010594 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010595 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010596 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010597}
10598
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010599PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010600 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010601\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010602Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010603False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010604
10605static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010606unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010607{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010608 Py_ssize_t i, length;
10609 int kind;
10610 void *data;
10611
10612 if (PyUnicode_READY(self) == -1)
10613 return NULL;
10614 length = PyUnicode_GET_LENGTH(self);
10615 kind = PyUnicode_KIND(self);
10616 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010617
Guido van Rossumd57fd912000-03-10 22:53:23 +000010618 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010619 if (length == 1)
10620 return PyBool_FromLong(
10621 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010622
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010623 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010624 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010625 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010626
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010627 for (i = 0; i < length; i++) {
10628 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010629 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010630 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010631 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010632}
10633
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010634PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010635 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010636\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010637Return True if all characters in S are digits\n\
10638and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010639
10640static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010641unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010642{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010643 Py_ssize_t i, length;
10644 int kind;
10645 void *data;
10646
10647 if (PyUnicode_READY(self) == -1)
10648 return NULL;
10649 length = PyUnicode_GET_LENGTH(self);
10650 kind = PyUnicode_KIND(self);
10651 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010652
Guido van Rossumd57fd912000-03-10 22:53:23 +000010653 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010654 if (length == 1) {
10655 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10656 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
10657 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010658
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010659 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010660 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010661 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010662
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010663 for (i = 0; i < length; i++) {
10664 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010665 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010666 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010667 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010668}
10669
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010670PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010671 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010672\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010673Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010674False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010675
10676static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010677unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010678{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010679 Py_ssize_t i, length;
10680 int kind;
10681 void *data;
10682
10683 if (PyUnicode_READY(self) == -1)
10684 return NULL;
10685 length = PyUnicode_GET_LENGTH(self);
10686 kind = PyUnicode_KIND(self);
10687 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010688
Guido van Rossumd57fd912000-03-10 22:53:23 +000010689 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010690 if (length == 1)
10691 return PyBool_FromLong(
10692 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010693
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010694 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010695 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010696 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010697
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010698 for (i = 0; i < length; i++) {
10699 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010700 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010701 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010702 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010703}
10704
Martin v. Löwis47383402007-08-15 07:32:56 +000010705int
10706PyUnicode_IsIdentifier(PyObject *self)
10707{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010708 int kind;
10709 void *data;
10710 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010711 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000010712
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010713 if (PyUnicode_READY(self) == -1) {
10714 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000010715 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010716 }
10717
10718 /* Special case for empty strings */
10719 if (PyUnicode_GET_LENGTH(self) == 0)
10720 return 0;
10721 kind = PyUnicode_KIND(self);
10722 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000010723
10724 /* PEP 3131 says that the first character must be in
10725 XID_Start and subsequent characters in XID_Continue,
10726 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000010727 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000010728 letters, digits, underscore). However, given the current
10729 definition of XID_Start and XID_Continue, it is sufficient
10730 to check just for these, except that _ must be allowed
10731 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010732 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050010733 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000010734 return 0;
10735
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040010736 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010737 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010738 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000010739 return 1;
10740}
10741
10742PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010743 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000010744\n\
10745Return True if S is a valid identifier according\n\
10746to the language definition.");
10747
10748static PyObject*
10749unicode_isidentifier(PyObject *self)
10750{
10751 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
10752}
10753
Georg Brandl559e5d72008-06-11 18:37:52 +000010754PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010755 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000010756\n\
10757Return True if all characters in S are considered\n\
10758printable in repr() or S is empty, False otherwise.");
10759
10760static PyObject*
10761unicode_isprintable(PyObject *self)
10762{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010763 Py_ssize_t i, length;
10764 int kind;
10765 void *data;
10766
10767 if (PyUnicode_READY(self) == -1)
10768 return NULL;
10769 length = PyUnicode_GET_LENGTH(self);
10770 kind = PyUnicode_KIND(self);
10771 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000010772
10773 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010774 if (length == 1)
10775 return PyBool_FromLong(
10776 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000010777
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010778 for (i = 0; i < length; i++) {
10779 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010780 Py_RETURN_FALSE;
10781 }
10782 }
10783 Py_RETURN_TRUE;
10784}
10785
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010786PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000010787 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010788\n\
10789Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000010790iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010791
10792static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010793unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010794{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010795 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010796}
10797
Martin v. Löwis18e16552006-02-15 17:27:45 +000010798static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000010799unicode_length(PyUnicodeObject *self)
10800{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010801 if (PyUnicode_READY(self) == -1)
10802 return -1;
10803 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010804}
10805
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010806PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010807 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010808\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010809Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010810done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010811
10812static PyObject *
10813unicode_ljust(PyUnicodeObject *self, PyObject *args)
10814{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010815 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010816 Py_UCS4 fillchar = ' ';
10817
10818 if (PyUnicode_READY(self) == -1)
10819 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010820
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010821 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010822 return NULL;
10823
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010824 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010825 Py_INCREF(self);
10826 return (PyObject*) self;
10827 }
10828
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010829 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010830}
10831
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010832PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010833 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010834\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010835Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010836
10837static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010838unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010839{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010840 return fixup(self, fixlower);
10841}
10842
/* Strip modes.  The values double as indices into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format string for each strip mode, indexed by the
   constants above */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Bare method name for a strip mode: the format string with its
   three-character "|O:" prefix skipped */
#define STRIPNAME(i) (stripformat[i]+3)
10851
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010852/* externally visible for str.strip(unicode) */
10853PyObject *
10854_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
10855{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010856 void *data;
10857 int kind;
10858 Py_ssize_t i, j, len;
10859 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010860
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010861 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
10862 return NULL;
10863
10864 kind = PyUnicode_KIND(self);
10865 data = PyUnicode_DATA(self);
10866 len = PyUnicode_GET_LENGTH(self);
10867 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
10868 PyUnicode_DATA(sepobj),
10869 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010870
Benjamin Peterson14339b62009-01-31 16:36:08 +000010871 i = 0;
10872 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010873 while (i < len &&
10874 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010875 i++;
10876 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010877 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010878
Benjamin Peterson14339b62009-01-31 16:36:08 +000010879 j = len;
10880 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010881 do {
10882 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010883 } while (j >= i &&
10884 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000010885 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010886 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010887
Victor Stinner12bab6d2011-10-01 01:53:49 +020010888 return PyUnicode_Substring((PyObject*)self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010889}
10890
10891PyObject*
10892PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
10893{
10894 unsigned char *data;
10895 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020010896 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010897
Victor Stinnerde636f32011-10-01 03:55:54 +020010898 if (PyUnicode_READY(self) == -1)
10899 return NULL;
10900
10901 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
10902
Victor Stinner12bab6d2011-10-01 01:53:49 +020010903 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010904 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020010905 if (PyUnicode_CheckExact(self)) {
10906 Py_INCREF(self);
10907 return self;
10908 }
10909 else
10910 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010911 }
10912
Victor Stinner12bab6d2011-10-01 01:53:49 +020010913 length = end - start;
10914 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010915 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010916
Victor Stinnerde636f32011-10-01 03:55:54 +020010917 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020010918 PyErr_SetString(PyExc_IndexError, "string index out of range");
10919 return NULL;
10920 }
10921
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010922 kind = PyUnicode_KIND(self);
10923 data = PyUnicode_1BYTE_DATA(self);
Victor Stinner034f6cf2011-09-30 02:26:44 +020010924 return PyUnicode_FromKindAndData(kind,
10925 data + PyUnicode_KIND_SIZE(kind, start),
Victor Stinner12bab6d2011-10-01 01:53:49 +020010926 length);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010927}
Guido van Rossumd57fd912000-03-10 22:53:23 +000010928
10929static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010930do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010931{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010932 int kind;
10933 void *data;
10934 Py_ssize_t len, i, j;
10935
10936 if (PyUnicode_READY(self) == -1)
10937 return NULL;
10938
10939 kind = PyUnicode_KIND(self);
10940 data = PyUnicode_DATA(self);
10941 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010942
Benjamin Peterson14339b62009-01-31 16:36:08 +000010943 i = 0;
10944 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010945 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010946 i++;
10947 }
10948 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010949
Benjamin Peterson14339b62009-01-31 16:36:08 +000010950 j = len;
10951 if (striptype != LEFTSTRIP) {
10952 do {
10953 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010954 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010955 j++;
10956 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010957
Victor Stinner12bab6d2011-10-01 01:53:49 +020010958 return PyUnicode_Substring((PyObject*)self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010959}
10960
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010961
10962static PyObject *
10963do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
10964{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010965 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010966
Benjamin Peterson14339b62009-01-31 16:36:08 +000010967 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
10968 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010969
Benjamin Peterson14339b62009-01-31 16:36:08 +000010970 if (sep != NULL && sep != Py_None) {
10971 if (PyUnicode_Check(sep))
10972 return _PyUnicode_XStrip(self, striptype, sep);
10973 else {
10974 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010975 "%s arg must be None or str",
10976 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010977 return NULL;
10978 }
10979 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010980
Benjamin Peterson14339b62009-01-31 16:36:08 +000010981 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010982}
10983
10984
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010985PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010986 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010987\n\
10988Return a copy of the string S with leading and trailing\n\
10989whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010990If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010991
10992static PyObject *
10993unicode_strip(PyUnicodeObject *self, PyObject *args)
10994{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010995 if (PyTuple_GET_SIZE(args) == 0)
10996 return do_strip(self, BOTHSTRIP); /* Common case */
10997 else
10998 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010999}
11000
11001
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011002PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011003 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011004\n\
11005Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011006If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011007
11008static PyObject *
11009unicode_lstrip(PyUnicodeObject *self, PyObject *args)
11010{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011011 if (PyTuple_GET_SIZE(args) == 0)
11012 return do_strip(self, LEFTSTRIP); /* Common case */
11013 else
11014 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011015}
11016
11017
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011018PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011019 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011020\n\
11021Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011022If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011023
11024static PyObject *
11025unicode_rstrip(PyUnicodeObject *self, PyObject *args)
11026{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011027 if (PyTuple_GET_SIZE(args) == 0)
11028 return do_strip(self, RIGHTSTRIP); /* Common case */
11029 else
11030 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011031}
11032
11033
Guido van Rossumd57fd912000-03-10 22:53:23 +000011034static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000011035unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011036{
11037 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011038 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011039
Georg Brandl222de0f2009-04-12 12:01:50 +000011040 if (len < 1) {
11041 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011042 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011043 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011044
Tim Peters7a29bd52001-09-12 03:03:31 +000011045 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011046 /* no repeat, return original string */
11047 Py_INCREF(str);
11048 return (PyObject*) str;
11049 }
Tim Peters8f422462000-09-09 06:13:41 +000011050
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011051 if (PyUnicode_READY(str) == -1)
11052 return NULL;
11053
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011054 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011055 PyErr_SetString(PyExc_OverflowError,
11056 "repeated string is too long");
11057 return NULL;
11058 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011059 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011060
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011061 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011062 if (!u)
11063 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011064 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011065
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011066 if (PyUnicode_GET_LENGTH(str) == 1) {
11067 const int kind = PyUnicode_KIND(str);
11068 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11069 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011070 if (kind == PyUnicode_1BYTE_KIND)
11071 memset(to, (unsigned char)fill_char, len);
11072 else {
11073 for (n = 0; n < len; ++n)
11074 PyUnicode_WRITE(kind, to, n, fill_char);
11075 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011076 }
11077 else {
11078 /* number of characters copied this far */
11079 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
11080 const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str);
11081 char *to = (char *) PyUnicode_DATA(u);
11082 Py_MEMCPY(to, PyUnicode_DATA(str),
11083 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011084 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011085 n = (done <= nchars-done) ? done : nchars-done;
11086 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011087 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011088 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011089 }
11090
11091 return (PyObject*) u;
11092}
11093
Alexander Belopolsky40018472011-02-26 01:02:56 +000011094PyObject *
11095PyUnicode_Replace(PyObject *obj,
11096 PyObject *subobj,
11097 PyObject *replobj,
11098 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011099{
11100 PyObject *self;
11101 PyObject *str1;
11102 PyObject *str2;
11103 PyObject *result;
11104
11105 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011106 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011107 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011108 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011109 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011110 Py_DECREF(self);
11111 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011112 }
11113 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011114 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011115 Py_DECREF(self);
11116 Py_DECREF(str1);
11117 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011118 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011119 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011120 Py_DECREF(self);
11121 Py_DECREF(str1);
11122 Py_DECREF(str2);
11123 return result;
11124}
11125
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011126PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011127 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011128\n\
11129Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011130old replaced by new. If the optional argument count is\n\
11131given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011132
11133static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011134unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011135{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011136 PyObject *str1;
11137 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011138 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011139 PyObject *result;
11140
Martin v. Löwis18e16552006-02-15 17:27:45 +000011141 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011142 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011143 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011144 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011145 str1 = PyUnicode_FromObject(str1);
11146 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11147 return NULL;
11148 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011149 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011150 Py_DECREF(str1);
11151 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011152 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011153
11154 result = replace(self, str1, str2, maxcount);
11155
11156 Py_DECREF(str1);
11157 Py_DECREF(str2);
11158 return result;
11159}
11160
Alexander Belopolsky40018472011-02-26 01:02:56 +000011161static PyObject *
11162unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011163{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011164 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011165 Py_ssize_t isize;
11166 Py_ssize_t osize, squote, dquote, i, o;
11167 Py_UCS4 max, quote;
11168 int ikind, okind;
11169 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011170
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011171 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011172 return NULL;
11173
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011174 isize = PyUnicode_GET_LENGTH(unicode);
11175 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011176
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011177 /* Compute length of output, quote characters, and
11178 maximum character */
11179 osize = 2; /* quotes */
11180 max = 127;
11181 squote = dquote = 0;
11182 ikind = PyUnicode_KIND(unicode);
11183 for (i = 0; i < isize; i++) {
11184 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11185 switch (ch) {
11186 case '\'': squote++; osize++; break;
11187 case '"': dquote++; osize++; break;
11188 case '\\': case '\t': case '\r': case '\n':
11189 osize += 2; break;
11190 default:
11191 /* Fast-path ASCII */
11192 if (ch < ' ' || ch == 0x7f)
11193 osize += 4; /* \xHH */
11194 else if (ch < 0x7f)
11195 osize++;
11196 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11197 osize++;
11198 max = ch > max ? ch : max;
11199 }
11200 else if (ch < 0x100)
11201 osize += 4; /* \xHH */
11202 else if (ch < 0x10000)
11203 osize += 6; /* \uHHHH */
11204 else
11205 osize += 10; /* \uHHHHHHHH */
11206 }
11207 }
11208
11209 quote = '\'';
11210 if (squote) {
11211 if (dquote)
11212 /* Both squote and dquote present. Use squote,
11213 and escape them */
11214 osize += squote;
11215 else
11216 quote = '"';
11217 }
11218
11219 repr = PyUnicode_New(osize, max);
11220 if (repr == NULL)
11221 return NULL;
11222 okind = PyUnicode_KIND(repr);
11223 odata = PyUnicode_DATA(repr);
11224
11225 PyUnicode_WRITE(okind, odata, 0, quote);
11226 PyUnicode_WRITE(okind, odata, osize-1, quote);
11227
11228 for (i = 0, o = 1; i < isize; i++) {
11229 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011230
11231 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011232 if ((ch == quote) || (ch == '\\')) {
11233 PyUnicode_WRITE(okind, odata, o++, '\\');
11234 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011235 continue;
11236 }
11237
Benjamin Peterson29060642009-01-31 22:14:21 +000011238 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011239 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011240 PyUnicode_WRITE(okind, odata, o++, '\\');
11241 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011242 }
11243 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011244 PyUnicode_WRITE(okind, odata, o++, '\\');
11245 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011246 }
11247 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011248 PyUnicode_WRITE(okind, odata, o++, '\\');
11249 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011250 }
11251
11252 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011253 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011254 PyUnicode_WRITE(okind, odata, o++, '\\');
11255 PyUnicode_WRITE(okind, odata, o++, 'x');
11256 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11257 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011258 }
11259
Georg Brandl559e5d72008-06-11 18:37:52 +000011260 /* Copy ASCII characters as-is */
11261 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011262 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011263 }
11264
Benjamin Peterson29060642009-01-31 22:14:21 +000011265 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000011266 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011267 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000011268 (categories Z* and C* except ASCII space)
11269 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011270 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011271 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011272 if (ch <= 0xff) {
11273 PyUnicode_WRITE(okind, odata, o++, '\\');
11274 PyUnicode_WRITE(okind, odata, o++, 'x');
11275 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11276 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011277 }
11278 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011279 else if (ch >= 0x10000) {
11280 PyUnicode_WRITE(okind, odata, o++, '\\');
11281 PyUnicode_WRITE(okind, odata, o++, 'U');
11282 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
11283 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
11284 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
11285 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
11286 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11287 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11288 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11289 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011290 }
11291 /* Map 16-bit characters to '\uxxxx' */
11292 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011293 PyUnicode_WRITE(okind, odata, o++, '\\');
11294 PyUnicode_WRITE(okind, odata, o++, 'u');
11295 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11296 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11297 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11298 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011299 }
11300 }
11301 /* Copy characters as-is */
11302 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011303 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011304 }
11305 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000011306 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011307 /* Closing quote already added at the beginning */
Walter Dörwald79e913e2007-05-12 11:08:06 +000011308 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011309}
11310
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011311PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011312 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011313\n\
11314Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011315such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011316arguments start and end are interpreted as in slice notation.\n\
11317\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011318Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011319
11320static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011321unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011322{
Jesus Ceaac451502011-04-20 17:09:23 +020011323 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011324 Py_ssize_t start;
11325 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011326 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011327
Jesus Ceaac451502011-04-20 17:09:23 +020011328 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
11329 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011330 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011331
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011332 if (PyUnicode_READY(self) == -1)
11333 return NULL;
11334 if (PyUnicode_READY(substring) == -1)
11335 return NULL;
11336
11337 result = any_find_slice(
11338 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11339 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011340 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011341
11342 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011343
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011344 if (result == -2)
11345 return NULL;
11346
Christian Heimes217cfd12007-12-02 14:31:20 +000011347 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011348}
11349
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011350PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011351 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011352\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011353Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011354
11355static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011356unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011357{
Jesus Ceaac451502011-04-20 17:09:23 +020011358 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011359 Py_ssize_t start;
11360 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011361 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011362
Jesus Ceaac451502011-04-20 17:09:23 +020011363 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
11364 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011365 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011366
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011367 if (PyUnicode_READY(self) == -1)
11368 return NULL;
11369 if (PyUnicode_READY(substring) == -1)
11370 return NULL;
11371
11372 result = any_find_slice(
11373 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11374 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011375 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011376
11377 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011378
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011379 if (result == -2)
11380 return NULL;
11381
Guido van Rossumd57fd912000-03-10 22:53:23 +000011382 if (result < 0) {
11383 PyErr_SetString(PyExc_ValueError, "substring not found");
11384 return NULL;
11385 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011386
Christian Heimes217cfd12007-12-02 14:31:20 +000011387 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011388}
11389
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011390PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011391 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011392\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011393Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011394done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011395
11396static PyObject *
11397unicode_rjust(PyUnicodeObject *self, PyObject *args)
11398{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011399 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011400 Py_UCS4 fillchar = ' ';
11401
Victor Stinnere9a29352011-10-01 02:14:59 +020011402 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011403 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011404
Victor Stinnere9a29352011-10-01 02:14:59 +020011405 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011406 return NULL;
11407
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011408 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011409 Py_INCREF(self);
11410 return (PyObject*) self;
11411 }
11412
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011413 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011414}
11415
Alexander Belopolsky40018472011-02-26 01:02:56 +000011416PyObject *
11417PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011418{
11419 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000011420
Guido van Rossumd57fd912000-03-10 22:53:23 +000011421 s = PyUnicode_FromObject(s);
11422 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011423 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011424 if (sep != NULL) {
11425 sep = PyUnicode_FromObject(sep);
11426 if (sep == NULL) {
11427 Py_DECREF(s);
11428 return NULL;
11429 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011430 }
11431
11432 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11433
11434 Py_DECREF(s);
11435 Py_XDECREF(sep);
11436 return result;
11437}
11438
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011439PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011440 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011441\n\
11442Return a list of the words in S, using sep as the\n\
11443delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000011444splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000011445whitespace string is a separator and empty strings are\n\
11446removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011447
11448static PyObject*
11449unicode_split(PyUnicodeObject *self, PyObject *args)
11450{
11451 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011452 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011453
Martin v. Löwis18e16552006-02-15 17:27:45 +000011454 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011455 return NULL;
11456
11457 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011458 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011459 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011460 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011461 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011462 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011463}
11464
Thomas Wouters477c8d52006-05-27 19:21:47 +000011465PyObject *
11466PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11467{
11468 PyObject* str_obj;
11469 PyObject* sep_obj;
11470 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011471 int kind1, kind2, kind;
11472 void *buf1 = NULL, *buf2 = NULL;
11473 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011474
11475 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020011476 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011477 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011478 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011479 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011480 Py_DECREF(str_obj);
11481 return NULL;
11482 }
11483
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011484 kind1 = PyUnicode_KIND(str_in);
11485 kind2 = PyUnicode_KIND(sep_obj);
11486 kind = kind1 > kind2 ? kind1 : kind2;
11487 buf1 = PyUnicode_DATA(str_in);
11488 if (kind1 != kind)
11489 buf1 = _PyUnicode_AsKind(str_in, kind);
11490 if (!buf1)
11491 goto onError;
11492 buf2 = PyUnicode_DATA(sep_obj);
11493 if (kind2 != kind)
11494 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11495 if (!buf2)
11496 goto onError;
11497 len1 = PyUnicode_GET_LENGTH(str_obj);
11498 len2 = PyUnicode_GET_LENGTH(sep_obj);
11499
11500 switch(PyUnicode_KIND(str_in)) {
11501 case PyUnicode_1BYTE_KIND:
11502 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11503 break;
11504 case PyUnicode_2BYTE_KIND:
11505 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11506 break;
11507 case PyUnicode_4BYTE_KIND:
11508 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11509 break;
11510 default:
11511 assert(0);
11512 out = 0;
11513 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011514
11515 Py_DECREF(sep_obj);
11516 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011517 if (kind1 != kind)
11518 PyMem_Free(buf1);
11519 if (kind2 != kind)
11520 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011521
11522 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011523 onError:
11524 Py_DECREF(sep_obj);
11525 Py_DECREF(str_obj);
11526 if (kind1 != kind && buf1)
11527 PyMem_Free(buf1);
11528 if (kind2 != kind && buf2)
11529 PyMem_Free(buf2);
11530 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011531}
11532
11533
11534PyObject *
11535PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11536{
11537 PyObject* str_obj;
11538 PyObject* sep_obj;
11539 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011540 int kind1, kind2, kind;
11541 void *buf1 = NULL, *buf2 = NULL;
11542 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011543
11544 str_obj = PyUnicode_FromObject(str_in);
11545 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000011546 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011547 sep_obj = PyUnicode_FromObject(sep_in);
11548 if (!sep_obj) {
11549 Py_DECREF(str_obj);
11550 return NULL;
11551 }
11552
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011553 kind1 = PyUnicode_KIND(str_in);
11554 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020011555 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011556 buf1 = PyUnicode_DATA(str_in);
11557 if (kind1 != kind)
11558 buf1 = _PyUnicode_AsKind(str_in, kind);
11559 if (!buf1)
11560 goto onError;
11561 buf2 = PyUnicode_DATA(sep_obj);
11562 if (kind2 != kind)
11563 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11564 if (!buf2)
11565 goto onError;
11566 len1 = PyUnicode_GET_LENGTH(str_obj);
11567 len2 = PyUnicode_GET_LENGTH(sep_obj);
11568
11569 switch(PyUnicode_KIND(str_in)) {
11570 case PyUnicode_1BYTE_KIND:
11571 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11572 break;
11573 case PyUnicode_2BYTE_KIND:
11574 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11575 break;
11576 case PyUnicode_4BYTE_KIND:
11577 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11578 break;
11579 default:
11580 assert(0);
11581 out = 0;
11582 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011583
11584 Py_DECREF(sep_obj);
11585 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011586 if (kind1 != kind)
11587 PyMem_Free(buf1);
11588 if (kind2 != kind)
11589 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011590
11591 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011592 onError:
11593 Py_DECREF(sep_obj);
11594 Py_DECREF(str_obj);
11595 if (kind1 != kind && buf1)
11596 PyMem_Free(buf1);
11597 if (kind2 != kind && buf2)
11598 PyMem_Free(buf2);
11599 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011600}
11601
11602PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011603 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011604\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011605Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011606the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011607found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011608
11609static PyObject*
11610unicode_partition(PyUnicodeObject *self, PyObject *separator)
11611{
11612 return PyUnicode_Partition((PyObject *)self, separator);
11613}
11614
11615PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000011616 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011617\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011618Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011619the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011620separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011621
11622static PyObject*
11623unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
11624{
11625 return PyUnicode_RPartition((PyObject *)self, separator);
11626}
11627
Alexander Belopolsky40018472011-02-26 01:02:56 +000011628PyObject *
11629PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011630{
11631 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011632
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011633 s = PyUnicode_FromObject(s);
11634 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011635 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011636 if (sep != NULL) {
11637 sep = PyUnicode_FromObject(sep);
11638 if (sep == NULL) {
11639 Py_DECREF(s);
11640 return NULL;
11641 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011642 }
11643
11644 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11645
11646 Py_DECREF(s);
11647 Py_XDECREF(sep);
11648 return result;
11649}
11650
11651PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011652 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011653\n\
11654Return a list of the words in S, using sep as the\n\
11655delimiter string, starting at the end of the string and\n\
11656working to the front. If maxsplit is given, at most maxsplit\n\
11657splits are done. If sep is not specified, any whitespace string\n\
11658is a separator.");
11659
11660static PyObject*
11661unicode_rsplit(PyUnicodeObject *self, PyObject *args)
11662{
11663 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011664 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011665
Martin v. Löwis18e16552006-02-15 17:27:45 +000011666 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011667 return NULL;
11668
11669 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011670 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011671 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011672 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011673 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011674 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011675}
11676
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011677PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011678 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011679\n\
11680Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000011681Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011682is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011683
11684static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011685unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011686{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011687 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000011688 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011689
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011690 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
11691 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011692 return NULL;
11693
Guido van Rossum86662912000-04-11 15:38:46 +000011694 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011695}
11696
11697static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000011698PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011699{
Walter Dörwald346737f2007-05-31 10:44:43 +000011700 if (PyUnicode_CheckExact(self)) {
11701 Py_INCREF(self);
11702 return self;
11703 } else
11704 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020011705 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011706}
11707
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011708PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011709 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011710\n\
11711Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011712and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011713
11714static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011715unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011716{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011717 return fixup(self, fixswapcase);
11718}
11719
Georg Brandlceee0772007-11-27 23:48:05 +000011720PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011721 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011722\n\
11723Return a translation table usable for str.translate().\n\
11724If there is only one argument, it must be a dictionary mapping Unicode\n\
11725ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011726Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011727If there are two arguments, they must be strings of equal length, and\n\
11728in the resulting dictionary, each character in x will be mapped to the\n\
11729character at the same position in y. If there is a third argument, it\n\
11730must be a string, whose characters will be mapped to None in the result.");
11731
11732static PyObject*
11733unicode_maketrans(PyUnicodeObject *null, PyObject *args)
11734{
11735 PyObject *x, *y = NULL, *z = NULL;
11736 PyObject *new = NULL, *key, *value;
11737 Py_ssize_t i = 0;
11738 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011739
Georg Brandlceee0772007-11-27 23:48:05 +000011740 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
11741 return NULL;
11742 new = PyDict_New();
11743 if (!new)
11744 return NULL;
11745 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011746 int x_kind, y_kind, z_kind;
11747 void *x_data, *y_data, *z_data;
11748
Georg Brandlceee0772007-11-27 23:48:05 +000011749 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000011750 if (!PyUnicode_Check(x)) {
11751 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
11752 "be a string if there is a second argument");
11753 goto err;
11754 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011755 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011756 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
11757 "arguments must have equal length");
11758 goto err;
11759 }
11760 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011761 x_kind = PyUnicode_KIND(x);
11762 y_kind = PyUnicode_KIND(y);
11763 x_data = PyUnicode_DATA(x);
11764 y_data = PyUnicode_DATA(y);
11765 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
11766 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
11767 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011768 if (!key || !value)
11769 goto err;
11770 res = PyDict_SetItem(new, key, value);
11771 Py_DECREF(key);
11772 Py_DECREF(value);
11773 if (res < 0)
11774 goto err;
11775 }
11776 /* create entries for deleting chars in z */
11777 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011778 z_kind = PyUnicode_KIND(z);
11779 z_data = PyUnicode_DATA(z);
Georg Brandlceee0772007-11-27 23:48:05 +000011780 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011781 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011782 if (!key)
11783 goto err;
11784 res = PyDict_SetItem(new, key, Py_None);
11785 Py_DECREF(key);
11786 if (res < 0)
11787 goto err;
11788 }
11789 }
11790 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011791 int kind;
11792 void *data;
11793
Georg Brandlceee0772007-11-27 23:48:05 +000011794 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000011795 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011796 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
11797 "to maketrans it must be a dict");
11798 goto err;
11799 }
11800 /* copy entries into the new dict, converting string keys to int keys */
11801 while (PyDict_Next(x, &i, &key, &value)) {
11802 if (PyUnicode_Check(key)) {
11803 /* convert string keys to integer keys */
11804 PyObject *newkey;
11805 if (PyUnicode_GET_SIZE(key) != 1) {
11806 PyErr_SetString(PyExc_ValueError, "string keys in translate "
11807 "table must be of length 1");
11808 goto err;
11809 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011810 kind = PyUnicode_KIND(key);
11811 data = PyUnicode_DATA(key);
11812 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000011813 if (!newkey)
11814 goto err;
11815 res = PyDict_SetItem(new, newkey, value);
11816 Py_DECREF(newkey);
11817 if (res < 0)
11818 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000011819 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011820 /* just keep integer keys */
11821 if (PyDict_SetItem(new, key, value) < 0)
11822 goto err;
11823 } else {
11824 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
11825 "be strings or integers");
11826 goto err;
11827 }
11828 }
11829 }
11830 return new;
11831 err:
11832 Py_DECREF(new);
11833 return NULL;
11834}
11835
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011836PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011837 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011838\n\
11839Return a copy of the string S, where all characters have been mapped\n\
11840through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011841Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000011842Unmapped characters are left untouched. Characters mapped to None\n\
11843are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011844
11845static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011846unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011847{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011848 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011849}
11850
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011851PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011852 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011853\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011854Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011855
11856static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011857unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011858{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011859 return fixup(self, fixupper);
11860}
11861
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011862PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011863 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011864\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000011865Pad a numeric string S with zeros on the left, to fill a field\n\
11866of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011867
11868static PyObject *
11869unicode_zfill(PyUnicodeObject *self, PyObject *args)
11870{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011871 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011872 PyUnicodeObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011873 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011874 int kind;
11875 void *data;
11876 Py_UCS4 chr;
11877
11878 if (PyUnicode_READY(self) == -1)
11879 return NULL;
11880
Martin v. Löwis18e16552006-02-15 17:27:45 +000011881 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011882 return NULL;
11883
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011884 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000011885 if (PyUnicode_CheckExact(self)) {
11886 Py_INCREF(self);
11887 return (PyObject*) self;
11888 }
11889 else
Victor Stinner2219e0a2011-10-01 01:16:59 +020011890 return PyUnicode_Copy((PyObject*)self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011891 }
11892
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011893 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011894
11895 u = pad(self, fill, 0, '0');
11896
Walter Dörwald068325e2002-04-15 13:36:47 +000011897 if (u == NULL)
11898 return NULL;
11899
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011900 kind = PyUnicode_KIND(u);
11901 data = PyUnicode_DATA(u);
11902 chr = PyUnicode_READ(kind, data, fill);
11903
11904 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011905 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011906 PyUnicode_WRITE(kind, data, 0, chr);
11907 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000011908 }
11909
11910 return (PyObject*) u;
11911}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011912
#if 0
/* Disabled debugging helper: returns the result of
   PyUnicode_TransformDecimalAndSpaceToASCII() applied to self. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    return PyUnicode_TransformDecimalAndSpaceToASCII(self);
}
#endif
11920
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011921PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011922 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011923\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011924Return True if S starts with the specified prefix, False otherwise.\n\
11925With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011926With optional end, stop comparing S at that position.\n\
11927prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011928
11929static PyObject *
11930unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011931 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011932{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011933 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011934 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011935 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011936 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011937 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011938
Jesus Ceaac451502011-04-20 17:09:23 +020011939 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011940 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011941 if (PyTuple_Check(subobj)) {
11942 Py_ssize_t i;
11943 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11944 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011945 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011946 if (substring == NULL)
11947 return NULL;
11948 result = tailmatch(self, substring, start, end, -1);
11949 Py_DECREF(substring);
11950 if (result) {
11951 Py_RETURN_TRUE;
11952 }
11953 }
11954 /* nothing matched */
11955 Py_RETURN_FALSE;
11956 }
11957 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011958 if (substring == NULL) {
11959 if (PyErr_ExceptionMatches(PyExc_TypeError))
11960 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
11961 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011962 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011963 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011964 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011965 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011966 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011967}
11968
11969
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011970PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011971 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011972\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011973Return True if S ends with the specified suffix, False otherwise.\n\
11974With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011975With optional end, stop comparing S at that position.\n\
11976suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011977
11978static PyObject *
11979unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011980 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011981{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011982 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011983 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011984 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011985 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011986 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011987
Jesus Ceaac451502011-04-20 17:09:23 +020011988 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011989 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011990 if (PyTuple_Check(subobj)) {
11991 Py_ssize_t i;
11992 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11993 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011994 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011995 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011996 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011997 result = tailmatch(self, substring, start, end, +1);
11998 Py_DECREF(substring);
11999 if (result) {
12000 Py_RETURN_TRUE;
12001 }
12002 }
12003 Py_RETURN_FALSE;
12004 }
12005 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012006 if (substring == NULL) {
12007 if (PyErr_ExceptionMatches(PyExc_TypeError))
12008 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12009 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012010 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012011 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012012 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012013 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012014 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012015}
12016
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012017#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012018
12019PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012020 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012021\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012022Return a formatted version of S, using substitutions from args and kwargs.\n\
12023The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012024
Eric Smith27bbca62010-11-04 17:06:58 +000012025PyDoc_STRVAR(format_map__doc__,
12026 "S.format_map(mapping) -> str\n\
12027\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012028Return a formatted version of S, using substitutions from mapping.\n\
12029The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012030
Eric Smith4a7d76d2008-05-30 18:10:19 +000012031static PyObject *
12032unicode__format__(PyObject* self, PyObject* args)
12033{
12034 PyObject *format_spec;
12035
12036 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12037 return NULL;
12038
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012039 return _PyUnicode_FormatAdvanced(self, format_spec, 0,
12040 PyUnicode_GET_LENGTH(format_spec));
Eric Smith4a7d76d2008-05-30 18:10:19 +000012041}
12042
Eric Smith8c663262007-08-25 02:26:07 +000012043PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012044 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012045\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012046Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012047
12048static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012049unicode__sizeof__(PyUnicodeObject *v)
12050{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012051 Py_ssize_t size;
12052
12053 /* If it's a compact object, account for base structure +
12054 character data. */
12055 if (PyUnicode_IS_COMPACT_ASCII(v))
12056 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12057 else if (PyUnicode_IS_COMPACT(v))
12058 size = sizeof(PyCompactUnicodeObject) +
12059 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v);
12060 else {
12061 /* If it is a two-block object, account for base object, and
12062 for character block if present. */
12063 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012064 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012065 size += (PyUnicode_GET_LENGTH(v) + 1) *
12066 PyUnicode_CHARACTER_SIZE(v);
12067 }
12068 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012069 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012070 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012071 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012072 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012073 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012074
12075 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012076}
12077
12078PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012079 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012080
12081static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012082unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012083{
Victor Stinner034f6cf2011-09-30 02:26:44 +020012084 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012085 if (!copy)
12086 return NULL;
12087 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012088}
12089
Guido van Rossumd57fd912000-03-10 22:53:23 +000012090static PyMethodDef unicode_methods[] = {
12091
12092 /* Order is according to common usage: often used methods should
12093 appear first, since lookup is done sequentially. */
12094
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012095 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012096 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12097 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012098 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012099 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12100 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12101 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12102 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12103 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12104 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12105 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012106 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012107 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12108 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12109 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012110 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012111 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12112 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12113 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012114 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012115 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012116 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012117 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012118 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12119 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12120 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12121 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12122 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12123 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12124 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12125 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12126 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12127 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12128 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12129 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12130 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12131 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012132 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012133 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012134 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012135 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012136 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012137 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012138 {"maketrans", (PyCFunction) unicode_maketrans,
12139 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012140 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012141#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012142 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012143#endif
12144
12145#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012146 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012147 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012148#endif
12149
Benjamin Peterson14339b62009-01-31 16:36:08 +000012150 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012151 {NULL, NULL}
12152};
12153
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012154static PyObject *
12155unicode_mod(PyObject *v, PyObject *w)
12156{
Brian Curtindfc80e32011-08-10 20:28:54 -050012157 if (!PyUnicode_Check(v))
12158 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012159 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012160}
12161
12162static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012163 0, /*nb_add*/
12164 0, /*nb_subtract*/
12165 0, /*nb_multiply*/
12166 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012167};
12168
Guido van Rossumd57fd912000-03-10 22:53:23 +000012169static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012170 (lenfunc) unicode_length, /* sq_length */
12171 PyUnicode_Concat, /* sq_concat */
12172 (ssizeargfunc) unicode_repeat, /* sq_repeat */
12173 (ssizeargfunc) unicode_getitem, /* sq_item */
12174 0, /* sq_slice */
12175 0, /* sq_ass_item */
12176 0, /* sq_ass_slice */
12177 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012178};
12179
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012180static PyObject*
12181unicode_subscript(PyUnicodeObject* self, PyObject* item)
12182{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012183 if (PyUnicode_READY(self) == -1)
12184 return NULL;
12185
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000012186 if (PyIndex_Check(item)) {
12187 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012188 if (i == -1 && PyErr_Occurred())
12189 return NULL;
12190 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012191 i += PyUnicode_GET_LENGTH(self);
Victor Stinner2fe5ced2011-10-02 00:25:40 +020012192 return unicode_getitem((PyObject*)self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012193 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000012194 Py_ssize_t start, stop, step, slicelength, cur, i;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012195 const Py_UNICODE* source_buf;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012196 Py_UNICODE* result_buf;
12197 PyObject* result;
12198
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012199 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000012200 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012201 return NULL;
12202 }
12203
12204 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012205 return PyUnicode_New(0, 0);
12206 } else if (start == 0 && step == 1 &&
12207 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000012208 PyUnicode_CheckExact(self)) {
12209 Py_INCREF(self);
12210 return (PyObject *)self;
12211 } else if (step == 1) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012212 return PyUnicode_Substring((PyObject*)self,
12213 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012214 } else {
12215 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +000012216 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
12217 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012218
Benjamin Peterson29060642009-01-31 22:14:21 +000012219 if (result_buf == NULL)
12220 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012221
12222 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
12223 result_buf[i] = source_buf[cur];
12224 }
Tim Petersced69f82003-09-16 20:30:58 +000012225
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012226 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +000012227 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012228 return result;
12229 }
12230 } else {
12231 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
12232 return NULL;
12233 }
12234}
12235
12236static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012237 (lenfunc)unicode_length, /* mp_length */
12238 (binaryfunc)unicode_subscript, /* mp_subscript */
12239 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012240};
12241
Guido van Rossumd57fd912000-03-10 22:53:23 +000012242
Guido van Rossumd57fd912000-03-10 22:53:23 +000012243/* Helpers for PyUnicode_Format() */
12244
12245static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000012246getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012247{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012248 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012249 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012250 (*p_argidx)++;
12251 if (arglen < 0)
12252 return args;
12253 else
12254 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012255 }
12256 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012257 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012258 return NULL;
12259}
12260
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012261/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012262
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012263static PyObject *
12264formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012265{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012266 char *p;
12267 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012268 double x;
Tim Petersced69f82003-09-16 20:30:58 +000012269
Guido van Rossumd57fd912000-03-10 22:53:23 +000012270 x = PyFloat_AsDouble(v);
12271 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012272 return NULL;
12273
Guido van Rossumd57fd912000-03-10 22:53:23 +000012274 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012275 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000012276
Eric Smith0923d1d2009-04-16 20:16:10 +000012277 p = PyOS_double_to_string(x, type, prec,
12278 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012279 if (p == NULL)
12280 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012281 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000012282 PyMem_Free(p);
12283 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012284}
12285
Tim Peters38fd5b62000-09-21 05:43:11 +000012286static PyObject*
12287formatlong(PyObject *val, int flags, int prec, int type)
12288{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012289 char *buf;
12290 int len;
12291 PyObject *str; /* temporary string object. */
12292 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012293
Benjamin Peterson14339b62009-01-31 16:36:08 +000012294 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
12295 if (!str)
12296 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012297 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012298 Py_DECREF(str);
12299 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012300}
12301
Guido van Rossumd57fd912000-03-10 22:53:23 +000012302static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012303formatchar(Py_UCS4 *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012304 size_t buflen,
12305 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012306{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012307 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012308 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012309 if (PyUnicode_GET_LENGTH(v) == 1) {
12310 buf[0] = PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000012311 buf[1] = '\0';
12312 return 1;
12313 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012314 goto onError;
12315 }
12316 else {
12317 /* Integer input truncated to a character */
12318 long x;
12319 x = PyLong_AsLong(v);
12320 if (x == -1 && PyErr_Occurred())
12321 goto onError;
12322
12323 if (x < 0 || x > 0x10ffff) {
12324 PyErr_SetString(PyExc_OverflowError,
12325 "%c arg not in range(0x110000)");
12326 return -1;
12327 }
12328
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012329 buf[0] = (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012330 buf[1] = '\0';
12331 return 1;
12332 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012333
Benjamin Peterson29060642009-01-31 22:14:21 +000012334 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012335 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012336 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012337 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012338}
12339
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012340/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012341 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012342*/
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012343#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012344
Alexander Belopolsky40018472011-02-26 01:02:56 +000012345PyObject *
12346PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012347{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012348 void *fmt;
12349 int fmtkind;
12350 PyObject *result;
12351 Py_UCS4 *res, *res0;
12352 Py_UCS4 max;
12353 int kind;
12354 Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012355 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012356 PyObject *dict = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012357 PyUnicodeObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +000012358
Guido van Rossumd57fd912000-03-10 22:53:23 +000012359 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012360 PyErr_BadInternalCall();
12361 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012362 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012363 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
12364 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012365 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012366 fmt = PyUnicode_DATA(uformat);
12367 fmtkind = PyUnicode_KIND(uformat);
12368 fmtcnt = PyUnicode_GET_LENGTH(uformat);
12369 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012370
12371 reslen = rescnt = fmtcnt + 100;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012372 res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4));
12373 if (res0 == NULL) {
12374 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012375 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012376 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012377
12378 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012379 arglen = PyTuple_Size(args);
12380 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012381 }
12382 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012383 arglen = -1;
12384 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012385 }
Christian Heimes90aa7642007-12-19 02:45:37 +000012386 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000012387 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000012388 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012389
12390 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012391 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012392 if (--rescnt < 0) {
12393 rescnt = fmtcnt + 100;
12394 reslen += rescnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012395 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12396 if (res0 == NULL){
12397 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012398 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012399 }
12400 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012401 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012402 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012403 *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012404 }
12405 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012406 /* Got a format specifier */
12407 int flags = 0;
12408 Py_ssize_t width = -1;
12409 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012410 Py_UCS4 c = '\0';
12411 Py_UCS4 fill;
Benjamin Peterson29060642009-01-31 22:14:21 +000012412 int isnumok;
12413 PyObject *v = NULL;
12414 PyObject *temp = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012415 void *pbuf;
12416 Py_ssize_t pindex;
Benjamin Peterson29060642009-01-31 22:14:21 +000012417 Py_UNICODE sign;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012418 Py_ssize_t len, len1;
12419 Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012420
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012421 fmtpos++;
12422 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
12423 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000012424 Py_ssize_t keylen;
12425 PyObject *key;
12426 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000012427
Benjamin Peterson29060642009-01-31 22:14:21 +000012428 if (dict == NULL) {
12429 PyErr_SetString(PyExc_TypeError,
12430 "format requires a mapping");
12431 goto onError;
12432 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012433 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012434 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012435 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012436 /* Skip over balanced parentheses */
12437 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012438 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000012439 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012440 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000012441 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012442 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000012443 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012444 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012445 if (fmtcnt < 0 || pcount > 0) {
12446 PyErr_SetString(PyExc_ValueError,
12447 "incomplete format key");
12448 goto onError;
12449 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020012450 key = PyUnicode_Substring((PyObject*)uformat,
12451 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000012452 if (key == NULL)
12453 goto onError;
12454 if (args_owned) {
12455 Py_DECREF(args);
12456 args_owned = 0;
12457 }
12458 args = PyObject_GetItem(dict, key);
12459 Py_DECREF(key);
12460 if (args == NULL) {
12461 goto onError;
12462 }
12463 args_owned = 1;
12464 arglen = -1;
12465 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012466 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012467 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012468 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012469 case '-': flags |= F_LJUST; continue;
12470 case '+': flags |= F_SIGN; continue;
12471 case ' ': flags |= F_BLANK; continue;
12472 case '#': flags |= F_ALT; continue;
12473 case '0': flags |= F_ZERO; continue;
12474 }
12475 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012476 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012477 if (c == '*') {
12478 v = getnextarg(args, arglen, &argidx);
12479 if (v == NULL)
12480 goto onError;
12481 if (!PyLong_Check(v)) {
12482 PyErr_SetString(PyExc_TypeError,
12483 "* wants int");
12484 goto onError;
12485 }
12486 width = PyLong_AsLong(v);
12487 if (width == -1 && PyErr_Occurred())
12488 goto onError;
12489 if (width < 0) {
12490 flags |= F_LJUST;
12491 width = -width;
12492 }
12493 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012494 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012495 }
12496 else if (c >= '0' && c <= '9') {
12497 width = c - '0';
12498 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012499 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012500 if (c < '0' || c > '9')
12501 break;
12502 if ((width*10) / 10 != width) {
12503 PyErr_SetString(PyExc_ValueError,
12504 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000012505 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000012506 }
12507 width = width*10 + (c - '0');
12508 }
12509 }
12510 if (c == '.') {
12511 prec = 0;
12512 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012513 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012514 if (c == '*') {
12515 v = getnextarg(args, arglen, &argidx);
12516 if (v == NULL)
12517 goto onError;
12518 if (!PyLong_Check(v)) {
12519 PyErr_SetString(PyExc_TypeError,
12520 "* wants int");
12521 goto onError;
12522 }
12523 prec = PyLong_AsLong(v);
12524 if (prec == -1 && PyErr_Occurred())
12525 goto onError;
12526 if (prec < 0)
12527 prec = 0;
12528 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012529 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012530 }
12531 else if (c >= '0' && c <= '9') {
12532 prec = c - '0';
12533 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012534 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012535 if (c < '0' || c > '9')
12536 break;
12537 if ((prec*10) / 10 != prec) {
12538 PyErr_SetString(PyExc_ValueError,
12539 "prec too big");
12540 goto onError;
12541 }
12542 prec = prec*10 + (c - '0');
12543 }
12544 }
12545 } /* prec */
12546 if (fmtcnt >= 0) {
12547 if (c == 'h' || c == 'l' || c == 'L') {
12548 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012549 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012550 }
12551 }
12552 if (fmtcnt < 0) {
12553 PyErr_SetString(PyExc_ValueError,
12554 "incomplete format");
12555 goto onError;
12556 }
12557 if (c != '%') {
12558 v = getnextarg(args, arglen, &argidx);
12559 if (v == NULL)
12560 goto onError;
12561 }
12562 sign = 0;
12563 fill = ' ';
12564 switch (c) {
12565
12566 case '%':
12567 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012568 kind = PyUnicode_4BYTE_KIND;
Benjamin Peterson29060642009-01-31 22:14:21 +000012569 /* presume that buffer length is at least 1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012570 PyUnicode_WRITE(kind, pbuf, 0, '%');
Benjamin Peterson29060642009-01-31 22:14:21 +000012571 len = 1;
12572 break;
12573
12574 case 's':
12575 case 'r':
12576 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000012577 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012578 temp = v;
12579 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012580 }
12581 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012582 if (c == 's')
12583 temp = PyObject_Str(v);
12584 else if (c == 'r')
12585 temp = PyObject_Repr(v);
12586 else
12587 temp = PyObject_ASCII(v);
12588 if (temp == NULL)
12589 goto onError;
12590 if (PyUnicode_Check(temp))
12591 /* nothing to do */;
12592 else {
12593 Py_DECREF(temp);
12594 PyErr_SetString(PyExc_TypeError,
12595 "%s argument has non-string str()");
12596 goto onError;
12597 }
12598 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012599 if (PyUnicode_READY(temp) == -1) {
12600 Py_CLEAR(temp);
12601 goto onError;
12602 }
12603 pbuf = PyUnicode_DATA(temp);
12604 kind = PyUnicode_KIND(temp);
12605 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012606 if (prec >= 0 && len > prec)
12607 len = prec;
12608 break;
12609
12610 case 'i':
12611 case 'd':
12612 case 'u':
12613 case 'o':
12614 case 'x':
12615 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000012616 isnumok = 0;
12617 if (PyNumber_Check(v)) {
12618 PyObject *iobj=NULL;
12619
12620 if (PyLong_Check(v)) {
12621 iobj = v;
12622 Py_INCREF(iobj);
12623 }
12624 else {
12625 iobj = PyNumber_Long(v);
12626 }
12627 if (iobj!=NULL) {
12628 if (PyLong_Check(iobj)) {
12629 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070012630 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000012631 Py_DECREF(iobj);
12632 if (!temp)
12633 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012634 if (PyUnicode_READY(temp) == -1) {
12635 Py_CLEAR(temp);
12636 goto onError;
12637 }
12638 pbuf = PyUnicode_DATA(temp);
12639 kind = PyUnicode_KIND(temp);
12640 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012641 sign = 1;
12642 }
12643 else {
12644 Py_DECREF(iobj);
12645 }
12646 }
12647 }
12648 if (!isnumok) {
12649 PyErr_Format(PyExc_TypeError,
12650 "%%%c format: a number is required, "
12651 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
12652 goto onError;
12653 }
12654 if (flags & F_ZERO)
12655 fill = '0';
12656 break;
12657
12658 case 'e':
12659 case 'E':
12660 case 'f':
12661 case 'F':
12662 case 'g':
12663 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012664 temp = formatfloat(v, flags, prec, c);
12665 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000012666 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012667 if (PyUnicode_READY(temp) == -1) {
12668 Py_CLEAR(temp);
12669 goto onError;
12670 }
12671 pbuf = PyUnicode_DATA(temp);
12672 kind = PyUnicode_KIND(temp);
12673 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012674 sign = 1;
12675 if (flags & F_ZERO)
12676 fill = '0';
12677 break;
12678
12679 case 'c':
12680 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012681 kind = PyUnicode_4BYTE_KIND;
Victor Stinnerb9dcffb2011-09-29 00:39:24 +020012682 len = formatchar(pbuf, Py_ARRAY_LENGTH(formatbuf), v);
Benjamin Peterson29060642009-01-31 22:14:21 +000012683 if (len < 0)
12684 goto onError;
12685 break;
12686
12687 default:
12688 PyErr_Format(PyExc_ValueError,
12689 "unsupported format character '%c' (0x%x) "
12690 "at index %zd",
12691 (31<=c && c<=126) ? (char)c : '?',
12692 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012693 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000012694 goto onError;
12695 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012696 /* pbuf is initialized here. */
12697 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000012698 if (sign) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012699 if (PyUnicode_READ(kind, pbuf, pindex) == '-' ||
12700 PyUnicode_READ(kind, pbuf, pindex) == '+') {
12701 sign = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012702 len--;
12703 }
12704 else if (flags & F_SIGN)
12705 sign = '+';
12706 else if (flags & F_BLANK)
12707 sign = ' ';
12708 else
12709 sign = 0;
12710 }
12711 if (width < len)
12712 width = len;
12713 if (rescnt - (sign != 0) < width) {
12714 reslen -= rescnt;
12715 rescnt = width + fmtcnt + 100;
12716 reslen += rescnt;
12717 if (reslen < 0) {
12718 Py_XDECREF(temp);
12719 PyErr_NoMemory();
12720 goto onError;
12721 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012722 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12723 if (res0 == 0) {
12724 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012725 Py_XDECREF(temp);
12726 goto onError;
12727 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012728 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012729 }
12730 if (sign) {
12731 if (fill != ' ')
12732 *res++ = sign;
12733 rescnt--;
12734 if (width > len)
12735 width--;
12736 }
12737 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012738 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12739 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000012740 if (fill != ' ') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012741 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12742 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012743 }
12744 rescnt -= 2;
12745 width -= 2;
12746 if (width < 0)
12747 width = 0;
12748 len -= 2;
12749 }
12750 if (width > len && !(flags & F_LJUST)) {
12751 do {
12752 --rescnt;
12753 *res++ = fill;
12754 } while (--width > len);
12755 }
12756 if (fill == ' ') {
12757 if (sign)
12758 *res++ = sign;
12759 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012760 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12761 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
12762 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12763 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012764 }
12765 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012766 /* Copy all characters, preserving len */
12767 len1 = len;
12768 while (len1--) {
12769 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12770 rescnt--;
12771 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012772 while (--width >= len) {
12773 --rescnt;
12774 *res++ = ' ';
12775 }
12776 if (dict && (argidx < arglen) && c != '%') {
12777 PyErr_SetString(PyExc_TypeError,
12778 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +000012779 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012780 goto onError;
12781 }
12782 Py_XDECREF(temp);
12783 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012784 } /* until end */
12785 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012786 PyErr_SetString(PyExc_TypeError,
12787 "not all arguments converted during string formatting");
12788 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012789 }
12790
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012791
12792 for (max=0, res = res0; res < res0+reslen-rescnt; res++)
12793 if (*res > max)
12794 max = *res;
12795 result = PyUnicode_New(reslen - rescnt, max);
12796 if (!result)
Benjamin Peterson29060642009-01-31 22:14:21 +000012797 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012798 kind = PyUnicode_KIND(result);
12799 for (res = res0; res < res0+reslen-rescnt; res++)
12800 PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res);
12801 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012802 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012803 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012804 }
12805 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012806 return (PyObject *)result;
12807
Benjamin Peterson29060642009-01-31 22:14:21 +000012808 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012809 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012810 Py_DECREF(uformat);
12811 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012812 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012813 }
12814 return NULL;
12815}
12816
Jeremy Hylton938ace62002-07-17 16:30:39 +000012817static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000012818unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
12819
Tim Peters6d6c1a32001-08-02 04:15:00 +000012820static PyObject *
12821unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12822{
Benjamin Peterson29060642009-01-31 22:14:21 +000012823 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012824 static char *kwlist[] = {"object", "encoding", "errors", 0};
12825 char *encoding = NULL;
12826 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000012827
Benjamin Peterson14339b62009-01-31 16:36:08 +000012828 if (type != &PyUnicode_Type)
12829 return unicode_subtype_new(type, args, kwds);
12830 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000012831 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012832 return NULL;
12833 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012834 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012835 if (encoding == NULL && errors == NULL)
12836 return PyObject_Str(x);
12837 else
Benjamin Peterson29060642009-01-31 22:14:21 +000012838 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000012839}
12840
Guido van Rossume023fe02001-08-30 03:12:59 +000012841static PyObject *
12842unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12843{
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012844 PyUnicodeObject *unicode, *self;
12845 Py_ssize_t length, char_size;
12846 int share_wstr, share_utf8;
12847 unsigned int kind;
12848 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000012849
Benjamin Peterson14339b62009-01-31 16:36:08 +000012850 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012851
12852 unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
12853 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012854 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020012855 assert(_PyUnicode_CHECK(unicode));
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020012856 if (_PyUnicode_READY_REPLACE(&unicode))
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012857 return NULL;
12858
12859 self = (PyUnicodeObject *) type->tp_alloc(type, 0);
12860 if (self == NULL) {
12861 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012862 return NULL;
12863 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012864 kind = PyUnicode_KIND(unicode);
12865 length = PyUnicode_GET_LENGTH(unicode);
12866
12867 _PyUnicode_LENGTH(self) = length;
12868 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
12869 _PyUnicode_STATE(self).interned = 0;
12870 _PyUnicode_STATE(self).kind = kind;
12871 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020012872 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012873 _PyUnicode_STATE(self).ready = 1;
12874 _PyUnicode_WSTR(self) = NULL;
12875 _PyUnicode_UTF8_LENGTH(self) = 0;
12876 _PyUnicode_UTF8(self) = NULL;
12877 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020012878 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012879
12880 share_utf8 = 0;
12881 share_wstr = 0;
12882 if (kind == PyUnicode_1BYTE_KIND) {
12883 char_size = 1;
12884 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
12885 share_utf8 = 1;
12886 }
12887 else if (kind == PyUnicode_2BYTE_KIND) {
12888 char_size = 2;
12889 if (sizeof(wchar_t) == 2)
12890 share_wstr = 1;
12891 }
12892 else {
12893 assert(kind == PyUnicode_4BYTE_KIND);
12894 char_size = 4;
12895 if (sizeof(wchar_t) == 4)
12896 share_wstr = 1;
12897 }
12898
12899 /* Ensure we won't overflow the length. */
12900 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
12901 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012902 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012903 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012904 data = PyObject_MALLOC((length + 1) * char_size);
12905 if (data == NULL) {
12906 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012907 goto onError;
12908 }
12909
Victor Stinnerc3c74152011-10-02 20:39:55 +020012910 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012911 if (share_utf8) {
12912 _PyUnicode_UTF8_LENGTH(self) = length;
12913 _PyUnicode_UTF8(self) = data;
12914 }
12915 if (share_wstr) {
12916 _PyUnicode_WSTR_LENGTH(self) = length;
12917 _PyUnicode_WSTR(self) = (wchar_t *)data;
12918 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012919
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012920 Py_MEMCPY(data, PyUnicode_DATA(unicode),
12921 PyUnicode_KIND_SIZE(kind, length + 1));
12922 Py_DECREF(unicode);
12923 return (PyObject *)self;
12924
12925onError:
12926 Py_DECREF(unicode);
12927 Py_DECREF(self);
12928 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000012929}
12930
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012931PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000012932 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000012933\n\
Collin Winterd474ce82007-08-07 19:42:11 +000012934Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000012935encoding defaults to the current default string encoding.\n\
12936errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000012937
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012938static PyObject *unicode_iter(PyObject *seq);
12939
/* Type object of the built-in "str" type.  The initializer is positional:
   entries appear in PyTypeObject member order, and a 0 means the slot is
   not set by this table. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                          /* tp_name */
    sizeof(PyUnicodeObject),        /* tp_basicsize */
    0,                              /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,    /* tp_dealloc */
    0,                              /* tp_print */
    0,                              /* tp_getattr */
    0,                              /* tp_setattr */
    0,                              /* tp_reserved */
    unicode_repr,                   /* tp_repr */
    &unicode_as_number,             /* tp_as_number */
    &unicode_as_sequence,           /* tp_as_sequence */
    &unicode_as_mapping,            /* tp_as_mapping */
    (hashfunc) unicode_hash,        /* tp_hash */
    0,                              /* tp_call */
    (reprfunc) unicode_str,         /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                              /* tp_setattro */
    0,                              /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
    unicode_doc,                    /* tp_doc */
    0,                              /* tp_traverse */
    0,                              /* tp_clear */
    PyUnicode_RichCompare,          /* tp_richcompare */
    0,                              /* tp_weaklistoffset */
    unicode_iter,                   /* tp_iter */
    0,                              /* tp_iternext */
    unicode_methods,                /* tp_methods */
    0,                              /* tp_members */
    0,                              /* tp_getset */
    &PyBaseObject_Type,             /* tp_base */
    0,                              /* tp_dict */
    0,                              /* tp_descr_get */
    0,                              /* tp_descr_set */
    0,                              /* tp_dictoffset */
    0,                              /* tp_init */
    0,                              /* tp_alloc */
    unicode_new,                    /* tp_new */
    PyObject_Del,                   /* tp_free */
};
12983
12984/* Initialize the Unicode implementation */
12985
Thomas Wouters78890102000-07-22 19:25:51 +000012986void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012987{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012988 int i;
12989
Thomas Wouters477c8d52006-05-27 19:21:47 +000012990 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012991 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012992 0x000A, /* LINE FEED */
12993 0x000D, /* CARRIAGE RETURN */
12994 0x001C, /* FILE SEPARATOR */
12995 0x001D, /* GROUP SEPARATOR */
12996 0x001E, /* RECORD SEPARATOR */
12997 0x0085, /* NEXT LINE */
12998 0x2028, /* LINE SEPARATOR */
12999 0x2029, /* PARAGRAPH SEPARATOR */
13000 };
13001
Fred Drakee4315f52000-05-09 19:53:39 +000013002 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020013003 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013004 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013005 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013006
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013007 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000013008 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000013009 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013010 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013011
13012 /* initialize the linebreak bloom filter */
13013 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013014 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020013015 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013016
13017 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013018}
13019
13020/* Finalize the Unicode implementation */
13021
/* Nothing is released by this function: it unconditionally reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int released = 0;

    return released;
}
13027
Guido van Rossumd57fd912000-03-10 22:53:23 +000013028void
Thomas Wouters78890102000-07-22 19:25:51 +000013029_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013030{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013031 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013032
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000013033 Py_XDECREF(unicode_empty);
13034 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000013035
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013036 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013037 if (unicode_latin1[i]) {
13038 Py_DECREF(unicode_latin1[i]);
13039 unicode_latin1[i] = NULL;
13040 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013041 }
Christian Heimesa156e092008-02-16 07:38:31 +000013042 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000013043}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000013044
Walter Dörwald16807132007-05-25 13:52:07 +000013045void
13046PyUnicode_InternInPlace(PyObject **p)
13047{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013048 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
13049 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020013050#ifdef Py_DEBUG
13051 assert(s != NULL);
13052 assert(_PyUnicode_CHECK(s));
13053#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000013054 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020013055 return;
13056#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000013057 /* If it's a subclass, we don't really know what putting
13058 it in the interned dict might do. */
13059 if (!PyUnicode_CheckExact(s))
13060 return;
13061 if (PyUnicode_CHECK_INTERNED(s))
13062 return;
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013063 if (_PyUnicode_READY_REPLACE(p)) {
Victor Stinner4fae54c2011-10-03 02:01:52 +020013064 assert(0 && "PyUnicode_READY fail in PyUnicode_InternInPlace");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013065 return;
13066 }
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013067 s = (PyUnicodeObject *)(*p);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013068 if (interned == NULL) {
13069 interned = PyDict_New();
13070 if (interned == NULL) {
13071 PyErr_Clear(); /* Don't leave an exception */
13072 return;
13073 }
13074 }
13075 /* It might be that the GetItem call fails even
13076 though the key is present in the dictionary,
13077 namely when this happens during a stack overflow. */
13078 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000013079 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013080 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000013081
Benjamin Peterson29060642009-01-31 22:14:21 +000013082 if (t) {
13083 Py_INCREF(t);
13084 Py_DECREF(*p);
13085 *p = t;
13086 return;
13087 }
Walter Dörwald16807132007-05-25 13:52:07 +000013088
Benjamin Peterson14339b62009-01-31 16:36:08 +000013089 PyThreadState_GET()->recursion_critical = 1;
13090 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
13091 PyErr_Clear();
13092 PyThreadState_GET()->recursion_critical = 0;
13093 return;
13094 }
13095 PyThreadState_GET()->recursion_critical = 0;
13096 /* The two references in interned are not counted by refcnt.
13097 The deallocator will take care of this */
13098 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013099 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000013100}
13101
13102void
13103PyUnicode_InternImmortal(PyObject **p)
13104{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013105 PyUnicodeObject *u = (PyUnicodeObject *)*p;
13106
Benjamin Peterson14339b62009-01-31 16:36:08 +000013107 PyUnicode_InternInPlace(p);
13108 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013109 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013110 Py_INCREF(*p);
13111 }
Walter Dörwald16807132007-05-25 13:52:07 +000013112}
13113
13114PyObject *
13115PyUnicode_InternFromString(const char *cp)
13116{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013117 PyObject *s = PyUnicode_FromString(cp);
13118 if (s == NULL)
13119 return NULL;
13120 PyUnicode_InternInPlace(&s);
13121 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000013122}
13123
Alexander Belopolsky40018472011-02-26 01:02:56 +000013124void
13125_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000013126{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013127 PyObject *keys;
13128 PyUnicodeObject *s;
13129 Py_ssize_t i, n;
13130 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000013131
Benjamin Peterson14339b62009-01-31 16:36:08 +000013132 if (interned == NULL || !PyDict_Check(interned))
13133 return;
13134 keys = PyDict_Keys(interned);
13135 if (keys == NULL || !PyList_Check(keys)) {
13136 PyErr_Clear();
13137 return;
13138 }
Walter Dörwald16807132007-05-25 13:52:07 +000013139
Benjamin Peterson14339b62009-01-31 16:36:08 +000013140 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
13141 detector, interned unicode strings are not forcibly deallocated;
13142 rather, we give them their stolen references back, and then clear
13143 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000013144
Benjamin Peterson14339b62009-01-31 16:36:08 +000013145 n = PyList_GET_SIZE(keys);
13146 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000013147 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013148 for (i = 0; i < n; i++) {
13149 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013150 if (PyUnicode_READY(s) == -1)
13151 fprintf(stderr, "could not ready string\n");
13152 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013153 case SSTATE_NOT_INTERNED:
13154 /* XXX Shouldn't happen */
13155 break;
13156 case SSTATE_INTERNED_IMMORTAL:
13157 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013158 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013159 break;
13160 case SSTATE_INTERNED_MORTAL:
13161 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013162 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013163 break;
13164 default:
13165 Py_FatalError("Inconsistent interned string state.");
13166 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013167 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013168 }
13169 fprintf(stderr, "total size of all interned strings: "
13170 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
13171 "mortal/immortal\n", mortal_size, immortal_size);
13172 Py_DECREF(keys);
13173 PyDict_Clear(interned);
13174 Py_DECREF(interned);
13175 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000013176}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013177
13178
13179/********************* Unicode Iterator **************************/
13180
/* State of a str iterator: the string being iterated over and the position
   of the next character to hand out. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;        /* index of the next character to return */
    PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
13186
/* Destructor of the str iterator: stop GC tracking, release the reference
   to the iterated string (it_seq may already be NULL once the iterator is
   exhausted, hence Py_XDECREF), then free the iterator object. */
static void
unicodeiter_dealloc(unicodeiterobject *it)
{
    _PyObject_GC_UNTRACK(it);
    Py_XDECREF(it->it_seq);
    PyObject_GC_Del(it);
}
13194
/* GC traversal function of the str iterator: reports the only object the
   iterator references, the string being iterated.  Returns 0 unless the
   Py_VISIT macro returns early. */
static int
unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
{
    /* Py_VISIT relies on the parameters being named "visit" and "arg". */
    Py_VISIT(it->it_seq);
    return 0;
}
13201
13202static PyObject *
13203unicodeiter_next(unicodeiterobject *it)
13204{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013205 PyUnicodeObject *seq;
13206 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013207
Benjamin Peterson14339b62009-01-31 16:36:08 +000013208 assert(it != NULL);
13209 seq = it->it_seq;
13210 if (seq == NULL)
13211 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013212 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013213
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013214 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
13215 int kind = PyUnicode_KIND(seq);
13216 void *data = PyUnicode_DATA(seq);
13217 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
13218 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013219 if (item != NULL)
13220 ++it->it_index;
13221 return item;
13222 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013223
Benjamin Peterson14339b62009-01-31 16:36:08 +000013224 Py_DECREF(seq);
13225 it->it_seq = NULL;
13226 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013227}
13228
13229static PyObject *
13230unicodeiter_len(unicodeiterobject *it)
13231{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013232 Py_ssize_t len = 0;
13233 if (it->it_seq)
13234 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
13235 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013236}
13237
/* Docstring of the iterator's __length_hint__ method. */
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");

/* Method table of the str iterator; referenced by the tp_methods slot of
   PyUnicodeIter_Type. */
static PyMethodDef unicodeiter_methods[] = {
    /* Takes no arguments besides the iterator itself (METH_NOARGS). */
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {NULL,      NULL}       /* sentinel */
};
13245
/* Type object of the iterator returned by unicode_iter().  The initializer
   is positional; a 0 means the slot is not set by this table, and members
   after the last listed entry are implicitly zero-initialized. */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",                     /* tp_name */
    sizeof(unicodeiterobject),          /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_reserved */
    0,                                  /* tp_repr */
    0,                                  /* tp_as_number */
    0,                                  /* tp_as_sequence */
    0,                                  /* tp_as_mapping */
    0,                                  /* tp_hash */
    0,                                  /* tp_call */
    0,                                  /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    0,                                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                                  /* tp_clear */
    0,                                  /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    PyObject_SelfIter,                  /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,                /* tp_methods */
    0,                                  /* tp_members */
};
13278
13279static PyObject *
13280unicode_iter(PyObject *seq)
13281{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013282 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013283
Benjamin Peterson14339b62009-01-31 16:36:08 +000013284 if (!PyUnicode_Check(seq)) {
13285 PyErr_BadInternalCall();
13286 return NULL;
13287 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013288 if (PyUnicode_READY(seq) == -1)
13289 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013290 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
13291 if (it == NULL)
13292 return NULL;
13293 it->it_index = 0;
13294 Py_INCREF(seq);
13295 it->it_seq = (PyUnicodeObject *)seq;
13296 _PyObject_GC_TRACK(it);
13297 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013298}
13299
/* uniops.h is included twice, once per character type.  Before each
   inclusion UNIOP(x) is defined to paste a type-specific prefix onto x
   (Py_UNICODE_x first, Py_UCS4_x second) and UNIOP_t names the matching
   element type; both macros are undefined again after each inclusion.
   NOTE(review): presumably uniops.h uses them to emit one set of helpers
   per character type -- confirm against uniops.h. */
#define UNIOP(x) Py_UNICODE_##x
#define UNIOP_t Py_UNICODE
#include "uniops.h"
#undef UNIOP
#undef UNIOP_t
#define UNIOP(x) Py_UCS4_##x
#define UNIOP_t Py_UCS4
#include "uniops.h"
#undef UNIOP
#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000013310
Victor Stinner71133ff2010-09-01 23:43:53 +000013311Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000013312PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000013313{
13314 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
13315 Py_UNICODE *copy;
13316 Py_ssize_t size;
13317
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013318 if (!PyUnicode_Check(unicode)) {
13319 PyErr_BadArgument();
13320 return NULL;
13321 }
Victor Stinner71133ff2010-09-01 23:43:53 +000013322 /* Ensure we won't overflow the size. */
13323 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
13324 PyErr_NoMemory();
13325 return NULL;
13326 }
13327 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
13328 size *= sizeof(Py_UNICODE);
13329 copy = PyMem_Malloc(size);
13330 if (copy == NULL) {
13331 PyErr_NoMemory();
13332 return NULL;
13333 }
13334 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
13335 return copy;
13336}
Martin v. Löwis5b222132007-06-10 09:51:05 +000013337
Georg Brandl66c221e2010-10-14 07:04:07 +000013338/* A _string module, to export formatter_parser and formatter_field_name_split
13339 to the string.Formatter class implemented in Python. */
13340
/* Functions exported by the _string module.  Both entries are METH_O, i.e.
   each takes exactly one object argument. */
static PyMethodDef _string_methods[] = {
    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
     METH_O, PyDoc_STR("split the argument as a field name")},
    {"formatter_parser", (PyCFunction) formatter_parser,
     METH_O, PyDoc_STR("parse the argument as a format string")},
    {NULL, NULL}    /* sentinel */
};
13348
/* Module definition of _string, passed to PyModule_Create() by
   PyInit__string().  Field labels follow the documented PyModuleDef
   layout. */
static struct PyModuleDef _string_module = {
    PyModuleDef_HEAD_INIT,
    "_string",                              /* m_name */
    PyDoc_STR("string helper module"),      /* m_doc */
    0,                                      /* m_size */
    _string_methods,                        /* m_methods */
    NULL,                                   /* remaining optional hooks unused */
    NULL,
    NULL,
    NULL
};
13360
13361PyMODINIT_FUNC
13362PyInit__string(void)
13363{
13364 return PyModule_Create(&_string_module);
13365}
13366
13367
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013368#ifdef __cplusplus
13369}
13370#endif