blob: 6aebdc0cff20a06d4053f740c0bfe622b32f33dc [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
/* Upper bound on the number of entries in the Unicode object free list. */

#define PyUnicode_MAXFREELIST 1024

/* Size threshold for the free list "stay alive" optimization.

   Objects on the free list whose size is below this threshold keep their
   allocated Unicode memory, which cuts malloc() overhead for small Unicode
   objects.

   Worst case, this leaves PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT + malloc()-overhead)
   bytes of unused garbage around.

   A threshold of 0 effectively disables the feature.

   Note: this is an experimental feature!  If Unicode objects cause core
   dumps, disable it.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Byte order selection; little endian unless WORDS_BIGENDIAN is defined */

#if defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
79
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000080/* --- Globals ------------------------------------------------------------
81
82 The globals are initialized by the _PyUnicode_Init() API and should
83 not be used before calling that API.
84
85*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000086
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000087
88#ifdef __cplusplus
89extern "C" {
90#endif
91
/* Object check used by the assertions in the macros below: the full
   consistency check when Py_DEBUG is defined, a plain type check otherwise. */
#if defined(Py_DEBUG)
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif

/* UTF-8 buffer.  The underscore variant reads the utf8 field directly; the
   checked variant asserts the object is valid and ready and, for compact
   ASCII strings, yields the address right after the PyASCIIObject header. */
#define _PyUnicode_UTF8(op) (((PyCompactUnicodeObject*)(op))->utf8)
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))

/* UTF-8 length.  Same split as above; for compact ASCII strings the checked
   variant yields the character length stored in the PyASCIIObject header. */
#define _PyUnicode_UTF8_LENGTH(op) (((PyCompactUnicodeObject*)(op))->utf8_length)
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))

/* Unchecked field accessors; all of them can be used as lvalues. */
#define _PyUnicode_WSTR(op)        (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)      (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)       (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)        (((PyASCIIObject *)(op))->hash)

/* Kind and length readers that assert the object is valid first. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* Data pointer stored in a PyUnicodeObject (unchecked). */
#define _PyUnicode_DATA_ANY(op) (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200126
/* Local override of PyUnicode_READY: assert the object is valid, then
   evaluate to 0 when it is already ready and to the result of
   _PyUnicode_Ready() otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 : _PyUnicode_Ready((PyObject *)(op))))

/* Same as above for a pointer to an object pointer; a string that is not
   ready is handed to _PyUnicode_ReadyReplace().  The argument is
   parenthesized before being dereferenced so that expressions such as
   `array + 1` are dereferenced as a whole. */
#define _PyUnicode_READY_REPLACE(p_obj)                 \
    (assert(_PyUnicode_CHECK(*(p_obj))),                \
     (PyUnicode_IS_READY(*(p_obj)) ?                    \
      0 : _PyUnicode_ReadyReplace((PyObject **)(p_obj))))
137
/* True if the utf8 pointer aliases the character data.  Must not be used
   on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer aliases the character data.  Only the macro
   argument is referenced, so the macro no longer depends on the caller
   having a variable named `unicode` in scope. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
145
/* Non-zero when the object owns a separate UTF-8 buffer: it is not a
   compact ASCII string, its utf8 pointer is set, and that pointer does not
   alias the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_COMPACT_ASCII(op)                    \
      ? 0                                               \
      : (_PyUnicode_UTF8(op) != NULL                    \
         && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))))
153
/* Copy a run of characters while converting between character types.
   from_type and to_type must be valid type names.  begin and end delimit
   the source characters and should be of type "from_type *"; to points to
   the destination buffer and is converted to "to_type *".  Each source
   character is cast to to_type and stored in order.  Note that end is
   evaluated once per iteration. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        const from_type *iter_;                         \
        to_type *to_;                                   \
        iter_ = (begin);                                \
        to_ = (to_type *)(to);                          \
        while (iter_ < (end)) {                         \
            *to_ = (to_type)*iter_;                     \
            ++iter_;                                    \
            ++to_;                                      \
        }                                               \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200168
/* Call after modifying a Unicode string: resets the stored hash to -1. */
#define _PyUnicode_DIRTY(op)                            \
    do {                                                \
        _PyUnicode_HASH(op) = -1;                       \
    } while (0)
171
Walter Dörwald16807132007-05-25 13:52:07 +0000172/* This dictionary holds all interned unicode strings. Note that references
173 to strings in this dictionary are *not* counted in the string's ob_refcnt.
174 When the interned string reaches a refcnt of 0 the string deallocation
175 function will delete the reference from this dictionary.
176
177 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000178 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000179*/
180static PyObject *interned;
181
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000182/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200183static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184
185/* Single character Unicode strings in the Latin-1 range are being
186 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200187static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000188
/* Lookup table for fast detection of the most frequent whitespace
   characters: one entry per ASCII code point (128 entries), 1 meaning
   whitespace. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
219
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200220/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200221static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200222static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200223
Alexander Belopolsky40018472011-02-26 01:02:56 +0000224static PyObject *
225unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000226 PyObject **errorHandler,const char *encoding, const char *reason,
227 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
228 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
229
Alexander Belopolsky40018472011-02-26 01:02:56 +0000230static void
231raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300232 const char *encoding,
233 const Py_UNICODE *unicode, Py_ssize_t size,
234 Py_ssize_t startpos, Py_ssize_t endpos,
235 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000236
/* Lookup table for line break characters: one entry per ASCII code point
   (128 entries), 1 meaning line break. */
static unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
264
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300265/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
266 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000267Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000268PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000269{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000270#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000271 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000272#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000273 /* This is actually an illegal character, so it should
274 not be passed to unichr. */
275 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000276#endif
277}
278
#ifdef Py_DEBUG
/* Assert that the state bits and pointers of a Unicode object agree with
   each other for its representation (compact ASCII, compact, legacy
   not-ready "wstr", or legacy ready).  Always returns 1 so that it can be
   used inside assert(); inconsistencies trip the inner assertions. */
static int
_PyUnicode_CheckConsistency(void *op)
{
    PyASCIIObject *ascii;
    PyCompactUnicodeObject *compact;
    PyUnicodeObject *unicode;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    compact = (PyCompactUnicodeObject *)op;
    unicode = (PyUnicodeObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.compact == 1) {
        if (ascii->state.ascii == 1) {
            /* compact ASCII */
            assert(kind == PyUnicode_1BYTE_KIND);
            assert(ascii->state.ready == 1);
        }
        else {
            /* compact, non-ASCII */
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            /* utf8 must not point at the memory right after the struct */
            assert (compact->utf8 != (void*)(compact + 1));
        }
        return 1;
    }

    if (kind == PyUnicode_WCHAR_KIND) {
        /* legacy string that is not ready: only wstr is set */
        assert(ascii->state.compact == 0);
        assert(ascii->state.ascii == 0);
        assert(ascii->state.ready == 0);
        assert(ascii->wstr != NULL);
        assert(unicode->data.any == NULL);
        assert(compact->utf8 == NULL);
        assert(ascii->state.interned == SSTATE_NOT_INTERNED);
        return 1;
    }

    /* legacy string that is ready */
    assert(kind == PyUnicode_1BYTE_KIND
           || kind == PyUnicode_2BYTE_KIND
           || kind == PyUnicode_4BYTE_KIND);
    assert(ascii->state.compact == 0);
    assert(ascii->state.ready == 1);
    assert(unicode->data.any != NULL);
    if (ascii->state.ascii) {
        /* ASCII: utf8 aliases the data */
        assert (compact->utf8 == unicode->data.any);
    }
    else {
        assert (compact->utf8 != unicode->data.any);
    }
    return 1;
}
#endif
332
/* --- Bloom Filters ----------------------------------------------------- */

/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask of BLOOM_WIDTH bits; the
   low bits of each unicode character ((ch) & (BLOOM_WIDTH - 1)) select the
   bit index. */

/* the linebreak mask is set up by Unicode_Init below */

/* Widest supported mask width that fits in an unsigned long. */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* BLOOM_ADD sets the bit for ch in mask (mask must be an lvalue);
   BLOOM is non-zero if the bit for ch is set in mask.  Both arguments are
   fully parenthesized so that compound expressions can be passed safely. */
#define BLOOM_ADD(mask, ch) (((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
#define BLOOM(mask, ch)     (((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* Line break test: table lookup below 128, otherwise the bloom mask acts
   as a cheap pre-filter in front of Py_UNICODE_ISLINEBREAK(). */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000361
Alexander Belopolsky40018472011-02-26 01:02:56 +0000362Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200363make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000364{
365 /* calculate simple bloom-style bitmask for a given unicode string */
366
Antoine Pitrouf068f942010-01-13 14:19:12 +0000367 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000368 Py_ssize_t i;
369
370 mask = 0;
371 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200372 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000373
374 return mask;
375}
376
/* 1 if chr occurs in str, else 0.  The bloom mask is tested first, so
   PyUnicode_FindChar() only runs when the mask bit for chr is set.
   Note: chr and str are each evaluated more than once. */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr)                                                   \
     ? (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0) \
     : 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000380
Guido van Rossumd57fd912000-03-10 22:53:23 +0000381/* --- Unicode Object ----------------------------------------------------- */
382
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200383static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200384fixup(PyUnicodeObject *self, Py_UCS4 (*fixfct)(PyUnicodeObject *s));
385
386Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
387 Py_ssize_t size, Py_UCS4 ch,
388 int direction)
389{
390 /* like wcschr, but doesn't stop at NULL characters */
391 Py_ssize_t i;
392 if (direction == 1) {
393 for(i = 0; i < size; i++)
394 if (PyUnicode_READ(kind, s, i) == ch)
395 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
396 }
397 else {
398 for(i = size-1; i >= 0; i--)
399 if (PyUnicode_READ(kind, s, i) == ch)
400 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
401 }
402 return NULL;
403}
404
Victor Stinnerfe226c02011-10-03 03:52:20 +0200405static PyObject*
406resize_compact(PyObject *unicode, Py_ssize_t length)
407{
408 Py_ssize_t char_size;
409 Py_ssize_t struct_size;
410 Py_ssize_t new_size;
411 int share_wstr;
412
413 assert(PyUnicode_IS_READY(unicode));
414 char_size = PyUnicode_CHARACTER_SIZE(unicode);
415 if (PyUnicode_IS_COMPACT_ASCII(unicode))
416 struct_size = sizeof(PyASCIIObject);
417 else
418 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200419 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200420
421 _Py_DEC_REFTOTAL;
422 _Py_ForgetReference(unicode);
423
424 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
425 PyErr_NoMemory();
426 return NULL;
427 }
428 new_size = (struct_size + (length + 1) * char_size);
429
430 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
431 if (unicode == NULL) {
432 PyObject_Del(unicode);
433 PyErr_NoMemory();
434 return NULL;
435 }
436 _Py_NewReference(unicode);
437 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200438 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200439 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200440 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
441 _PyUnicode_WSTR_LENGTH(unicode) = length;
442 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200443 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
444 length, 0);
445 return unicode;
446}
447
Alexander Belopolsky40018472011-02-26 01:02:56 +0000448static int
Victor Stinnerfe226c02011-10-03 03:52:20 +0200449resize_inplace(register PyUnicodeObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000450{
451 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000452
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200453 assert(!PyUnicode_IS_COMPACT(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200454
Victor Stinnerfe226c02011-10-03 03:52:20 +0200455 assert(Py_REFCNT(unicode) == 1);
Victor Stinnerb15d4d82011-09-28 23:59:20 +0200456 _PyUnicode_DIRTY(unicode);
Tim Petersced69f82003-09-16 20:30:58 +0000457
Victor Stinnerfe226c02011-10-03 03:52:20 +0200458 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
459 {
460 PyObject_DEL(_PyUnicode_UTF8(unicode));
461 _PyUnicode_UTF8(unicode) = NULL;
462 }
463
464 if (PyUnicode_IS_READY(unicode)) {
465 Py_ssize_t char_size;
466 Py_ssize_t new_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200467 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200468 void *data;
469
470 data = _PyUnicode_DATA_ANY(unicode);
471 assert(data != NULL);
472 char_size = PyUnicode_CHARACTER_SIZE(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200473 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
474 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200475
476 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
477 PyErr_NoMemory();
478 return -1;
479 }
480 new_size = (length + 1) * char_size;
481
482 data = (PyObject *)PyObject_REALLOC(data, new_size);
483 if (data == NULL) {
484 PyErr_NoMemory();
485 return -1;
486 }
487 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200488 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200489 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200490 _PyUnicode_WSTR_LENGTH(unicode) = length;
491 }
492 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200493 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200494 _PyUnicode_UTF8_LENGTH(unicode) = length;
495 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200496 _PyUnicode_LENGTH(unicode) = length;
497 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
498 if (share_wstr)
499 return 0;
500 }
501 if (_PyUnicode_WSTR(unicode) != NULL) {
502 assert(_PyUnicode_WSTR(unicode) != NULL);
503
504 oldstr = _PyUnicode_WSTR(unicode);
505 _PyUnicode_WSTR(unicode) = PyObject_REALLOC(_PyUnicode_WSTR(unicode),
506 sizeof(Py_UNICODE) * (length + 1));
507 if (!_PyUnicode_WSTR(unicode)) {
508 _PyUnicode_WSTR(unicode) = (Py_UNICODE *)oldstr;
509 PyErr_NoMemory();
510 return -1;
511 }
512 _PyUnicode_WSTR(unicode)[length] = 0;
513 _PyUnicode_WSTR_LENGTH(unicode) = length;
514 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000515 return 0;
516}
517
Victor Stinnerfe226c02011-10-03 03:52:20 +0200518static PyObject*
519resize_copy(PyObject *unicode, Py_ssize_t length)
520{
521 Py_ssize_t copy_length;
522 if (PyUnicode_IS_COMPACT(unicode)) {
523 PyObject *copy;
524 assert(PyUnicode_IS_READY(unicode));
525
526 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
527 if (copy == NULL)
528 return NULL;
529
530 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
531 if (PyUnicode_CopyCharacters(copy, 0,
532 unicode, 0,
533 copy_length) < 0)
534 {
535 Py_DECREF(copy);
536 return NULL;
537 }
538 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200539 }
540 else {
Victor Stinner2fd82272011-10-03 04:06:05 +0200541 PyUnicodeObject *w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200542 assert(_PyUnicode_WSTR(unicode) != NULL);
543 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinner2fd82272011-10-03 04:06:05 +0200544 w = _PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200545 if (w == NULL)
546 return NULL;
547 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
548 copy_length = Py_MIN(copy_length, length);
549 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
550 copy_length);
551 return (PyObject*)w;
552 }
553}
554
Guido van Rossumd57fd912000-03-10 22:53:23 +0000555/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000556 Ux0000 terminated; some code (e.g. new_identifier)
557 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000558
559 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000560 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000561
562*/
563
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200564#ifdef Py_DEBUG
565int unicode_old_new_calls = 0;
566#endif
567
Alexander Belopolsky40018472011-02-26 01:02:56 +0000568static PyUnicodeObject *
569_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000570{
571 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200572 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000573
Thomas Wouters477c8d52006-05-27 19:21:47 +0000574 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000575 if (length == 0 && unicode_empty != NULL) {
576 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200577 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000578 }
579
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000580 /* Ensure we won't overflow the size. */
581 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
582 return (PyUnicodeObject *)PyErr_NoMemory();
583 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200584 if (length < 0) {
585 PyErr_SetString(PyExc_SystemError,
586 "Negative size passed to _PyUnicode_New");
587 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000588 }
589
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200590#ifdef Py_DEBUG
591 ++unicode_old_new_calls;
592#endif
593
594 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
595 if (unicode == NULL)
596 return NULL;
597 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
598 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
599 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000600 PyErr_NoMemory();
601 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000602 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200603
Jeremy Hyltond8082792003-09-16 19:41:39 +0000604 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000605 * the caller fails before initializing str -- unicode_resize()
606 * reads str[0], and the Keep-Alive optimization can keep memory
607 * allocated for str alive across a call to unicode_dealloc(unicode).
608 * We don't want unicode_resize to read uninitialized memory in
609 * that case.
610 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200611 _PyUnicode_WSTR(unicode)[0] = 0;
612 _PyUnicode_WSTR(unicode)[length] = 0;
613 _PyUnicode_WSTR_LENGTH(unicode) = length;
614 _PyUnicode_HASH(unicode) = -1;
615 _PyUnicode_STATE(unicode).interned = 0;
616 _PyUnicode_STATE(unicode).kind = 0;
617 _PyUnicode_STATE(unicode).compact = 0;
618 _PyUnicode_STATE(unicode).ready = 0;
619 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200620 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200621 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200622 _PyUnicode_UTF8(unicode) = NULL;
623 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000624 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000625
Benjamin Peterson29060642009-01-31 22:14:21 +0000626 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000627 /* XXX UNREF/NEWREF interface should be more symmetrical */
628 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000629 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000630 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000631 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000632}
633
Victor Stinnerf42dc442011-10-02 23:33:16 +0200634static const char*
635unicode_kind_name(PyObject *unicode)
636{
Victor Stinner42dfd712011-10-03 14:41:45 +0200637 /* don't check consistency: unicode_kind_name() is called from
638 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200639 if (!PyUnicode_IS_COMPACT(unicode))
640 {
641 if (!PyUnicode_IS_READY(unicode))
642 return "wstr";
643 switch(PyUnicode_KIND(unicode))
644 {
645 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200646 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200647 return "legacy ascii";
648 else
649 return "legacy latin1";
650 case PyUnicode_2BYTE_KIND:
651 return "legacy UCS2";
652 case PyUnicode_4BYTE_KIND:
653 return "legacy UCS4";
654 default:
655 return "<legacy invalid kind>";
656 }
657 }
658 assert(PyUnicode_IS_READY(unicode));
659 switch(PyUnicode_KIND(unicode))
660 {
661 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200662 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200663 return "ascii";
664 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200665 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200666 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200667 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200668 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200669 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200670 default:
671 return "<invalid compact kind>";
672 }
673}
674
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200675#ifdef Py_DEBUG
/* Debug-build counter; it is only defined here (presumably incremented by
   PyUnicode_New -- confirm against its body). */
int unicode_new_new_calls = 0;

/* Functions wrapping macros for use in debugger */

/* Function form of the PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
682
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *payload = _PyUnicode_COMPACT_DATA(unicode);
    return payload;
}
686void *_PyUnicode_data(void *unicode){
687 printf("obj %p\n", unicode);
688 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
689 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
690 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
691 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
692 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
693 return PyUnicode_DATA(unicode);
694}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200695
696void
697_PyUnicode_Dump(PyObject *op)
698{
699 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200700 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
701 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
702 void *data;
703 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
704 if (ascii->state.compact)
705 data = (compact + 1);
706 else
707 data = unicode->data.any;
708 if (ascii->wstr == data)
709 printf("shared ");
710 printf("wstr=%p", ascii->wstr);
Victor Stinnera3b334d2011-10-03 13:53:37 +0200711 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200712 printf(" (%zu), ", compact->wstr_length);
713 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
714 printf("shared ");
715 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200716 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200717 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200718}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200719#endif
720
721PyObject *
722PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
723{
724 PyObject *obj;
725 PyCompactUnicodeObject *unicode;
726 void *data;
727 int kind_state;
728 int is_sharing = 0, is_ascii = 0;
729 Py_ssize_t char_size;
730 Py_ssize_t struct_size;
731
732 /* Optimization for empty strings */
733 if (size == 0 && unicode_empty != NULL) {
734 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200735 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200736 }
737
738#ifdef Py_DEBUG
739 ++unicode_new_new_calls;
740#endif
741
742 struct_size = sizeof(PyCompactUnicodeObject);
743 if (maxchar < 128) {
744 kind_state = PyUnicode_1BYTE_KIND;
745 char_size = 1;
746 is_ascii = 1;
747 struct_size = sizeof(PyASCIIObject);
748 }
749 else if (maxchar < 256) {
750 kind_state = PyUnicode_1BYTE_KIND;
751 char_size = 1;
752 }
753 else if (maxchar < 65536) {
754 kind_state = PyUnicode_2BYTE_KIND;
755 char_size = 2;
756 if (sizeof(wchar_t) == 2)
757 is_sharing = 1;
758 }
759 else {
760 kind_state = PyUnicode_4BYTE_KIND;
761 char_size = 4;
762 if (sizeof(wchar_t) == 4)
763 is_sharing = 1;
764 }
765
766 /* Ensure we won't overflow the size. */
767 if (size < 0) {
768 PyErr_SetString(PyExc_SystemError,
769 "Negative size passed to PyUnicode_New");
770 return NULL;
771 }
772 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
773 return PyErr_NoMemory();
774
775 /* Duplicated allocation code from _PyObject_New() instead of a call to
776 * PyObject_New() so we are able to allocate space for the object and
777 * it's data buffer.
778 */
779 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
780 if (obj == NULL)
781 return PyErr_NoMemory();
782 obj = PyObject_INIT(obj, &PyUnicode_Type);
783 if (obj == NULL)
784 return NULL;
785
786 unicode = (PyCompactUnicodeObject *)obj;
787 if (is_ascii)
788 data = ((PyASCIIObject*)obj) + 1;
789 else
790 data = unicode + 1;
791 _PyUnicode_LENGTH(unicode) = size;
792 _PyUnicode_HASH(unicode) = -1;
793 _PyUnicode_STATE(unicode).interned = 0;
794 _PyUnicode_STATE(unicode).kind = kind_state;
795 _PyUnicode_STATE(unicode).compact = 1;
796 _PyUnicode_STATE(unicode).ready = 1;
797 _PyUnicode_STATE(unicode).ascii = is_ascii;
798 if (is_ascii) {
799 ((char*)data)[size] = 0;
800 _PyUnicode_WSTR(unicode) = NULL;
801 }
802 else if (kind_state == PyUnicode_1BYTE_KIND) {
803 ((char*)data)[size] = 0;
804 _PyUnicode_WSTR(unicode) = NULL;
805 _PyUnicode_WSTR_LENGTH(unicode) = 0;
806 unicode->utf8_length = 0;
807 unicode->utf8 = NULL;
808 }
809 else {
810 unicode->utf8 = NULL;
811 if (kind_state == PyUnicode_2BYTE_KIND)
812 ((Py_UCS2*)data)[size] = 0;
813 else /* kind_state == PyUnicode_4BYTE_KIND */
814 ((Py_UCS4*)data)[size] = 0;
815 if (is_sharing) {
816 _PyUnicode_WSTR_LENGTH(unicode) = size;
817 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
818 }
819 else {
820 _PyUnicode_WSTR_LENGTH(unicode) = 0;
821 _PyUnicode_WSTR(unicode) = NULL;
822 }
823 }
824 return obj;
825}
826
827#if SIZEOF_WCHAR_T == 2
828/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
829 will decode surrogate pairs, the other conversions are implemented as macros
830 for efficency.
831
832 This function assumes that unicode can hold one more code point than wstr
833 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +0200834static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200835unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
836 PyUnicodeObject *unicode)
837{
838 const wchar_t *iter;
839 Py_UCS4 *ucs4_out;
840
Victor Stinner910337b2011-10-03 03:20:16 +0200841 assert(unicode != NULL);
842 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200843 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
844 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
845
846 for (iter = begin; iter < end; ) {
847 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
848 _PyUnicode_GET_LENGTH(unicode)));
849 if (*iter >= 0xD800 && *iter <= 0xDBFF
850 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
851 {
852 *ucs4_out++ = (((iter[0] & 0x3FF)<<10) | (iter[1] & 0x3FF)) + 0x10000;
853 iter += 2;
854 }
855 else {
856 *ucs4_out++ = *iter;
857 iter++;
858 }
859 }
860 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
861 _PyUnicode_GET_LENGTH(unicode)));
862
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200863}
864#endif
865
Victor Stinnercd9950f2011-10-02 00:34:53 +0200866static int
867_PyUnicode_Dirty(PyObject *unicode)
868{
Victor Stinner910337b2011-10-03 03:20:16 +0200869 assert(_PyUnicode_CHECK(unicode));
Victor Stinnercd9950f2011-10-02 00:34:53 +0200870 if (Py_REFCNT(unicode) != 1) {
871 PyErr_SetString(PyExc_ValueError,
872 "Cannot modify a string having more than 1 reference");
873 return -1;
874 }
875 _PyUnicode_DIRTY(unicode);
876 return 0;
877}
878
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200879Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200880PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
881 PyObject *from, Py_ssize_t from_start,
882 Py_ssize_t how_many)
883{
Victor Stinnera0702ab2011-09-29 14:14:38 +0200884 unsigned int from_kind, to_kind;
885 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200886
Victor Stinnerb1536152011-09-30 02:26:10 +0200887 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
888 PyErr_BadInternalCall();
889 return -1;
890 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200891
892 if (PyUnicode_READY(from))
893 return -1;
894 if (PyUnicode_READY(to))
895 return -1;
896
Victor Stinnerff9e50f2011-09-28 22:17:19 +0200897 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200898 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
899 PyErr_Format(PyExc_ValueError,
900 "Cannot write %zi characters at %zi "
901 "in a string of %zi characters",
902 how_many, to_start, PyUnicode_GET_LENGTH(to));
903 return -1;
904 }
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200905 if (how_many == 0)
906 return 0;
907
Victor Stinnercd9950f2011-10-02 00:34:53 +0200908 if (_PyUnicode_Dirty(to))
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200909 return -1;
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200910
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200911 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200912 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200913 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200914 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200915
Victor Stinnerf42dc442011-10-02 23:33:16 +0200916 if (from_kind == to_kind
917 /* deny latin1 => ascii */
918 && PyUnicode_MAX_CHAR_VALUE(to) >= PyUnicode_MAX_CHAR_VALUE(from))
919 {
Victor Stinnera0702ab2011-09-29 14:14:38 +0200920 Py_MEMCPY((char*)to_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200921 + PyUnicode_KIND_SIZE(to_kind, to_start),
Victor Stinnera0702ab2011-09-29 14:14:38 +0200922 (char*)from_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200923 + PyUnicode_KIND_SIZE(from_kind, from_start),
924 PyUnicode_KIND_SIZE(to_kind, how_many));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200925 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200926 else if (from_kind == PyUnicode_1BYTE_KIND
927 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200928 {
929 _PyUnicode_CONVERT_BYTES(
930 Py_UCS1, Py_UCS2,
931 PyUnicode_1BYTE_DATA(from) + from_start,
932 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
933 PyUnicode_2BYTE_DATA(to) + to_start
934 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200935 }
Victor Stinner157f83f2011-09-28 21:41:31 +0200936 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200937 && to_kind == PyUnicode_4BYTE_KIND)
938 {
939 _PyUnicode_CONVERT_BYTES(
940 Py_UCS1, Py_UCS4,
941 PyUnicode_1BYTE_DATA(from) + from_start,
942 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
943 PyUnicode_4BYTE_DATA(to) + to_start
944 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200945 }
946 else if (from_kind == PyUnicode_2BYTE_KIND
947 && to_kind == PyUnicode_4BYTE_KIND)
948 {
949 _PyUnicode_CONVERT_BYTES(
950 Py_UCS2, Py_UCS4,
951 PyUnicode_2BYTE_DATA(from) + from_start,
952 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
953 PyUnicode_4BYTE_DATA(to) + to_start
954 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200955 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200956 else {
957 int invalid_kinds;
Victor Stinnerf42dc442011-10-02 23:33:16 +0200958
959 /* check if max_char(from substring) <= max_char(to) */
960 if (from_kind > to_kind
961 /* latin1 => ascii */
Victor Stinnera3b334d2011-10-03 13:53:37 +0200962 || (PyUnicode_IS_ASCII(to)
Victor Stinnerf42dc442011-10-02 23:33:16 +0200963 && to_kind == PyUnicode_1BYTE_KIND
Victor Stinnera3b334d2011-10-03 13:53:37 +0200964 && !PyUnicode_IS_ASCII(from)))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200965 {
Victor Stinnera0702ab2011-09-29 14:14:38 +0200966 /* slow path to check for character overflow */
967 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
968 Py_UCS4 ch, maxchar;
969 Py_ssize_t i;
970
971 maxchar = 0;
972 invalid_kinds = 0;
973 for (i=0; i < how_many; i++) {
974 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
975 if (ch > maxchar) {
976 maxchar = ch;
977 if (maxchar > to_maxchar) {
978 invalid_kinds = 1;
979 break;
980 }
981 }
982 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
983 }
984 }
985 else
986 invalid_kinds = 1;
987 if (invalid_kinds) {
988 PyErr_Format(PyExc_ValueError,
Victor Stinnerf42dc442011-10-02 23:33:16 +0200989 "Cannot copy %s characters "
990 "into a string of %s characters",
991 unicode_kind_name(from),
992 unicode_kind_name(to));
Victor Stinnera0702ab2011-09-29 14:14:38 +0200993 return -1;
994 }
995 }
996 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200997}
998
Victor Stinner17222162011-09-28 22:15:37 +0200999/* Find the maximum code point and count the number of surrogate pairs so a
1000 correct string length can be computed before converting a string to UCS4.
1001 This function counts single surrogates as a character and not as a pair.
1002
1003 Return 0 on success, or -1 on error. */
1004static int
1005find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1006 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001007{
1008 const wchar_t *iter;
1009
Victor Stinnerc53be962011-10-02 21:33:54 +02001010 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001011 if (num_surrogates == NULL || maxchar == NULL) {
1012 PyErr_SetString(PyExc_SystemError,
1013 "unexpected NULL arguments to "
1014 "PyUnicode_FindMaxCharAndNumSurrogatePairs");
1015 return -1;
1016 }
1017
1018 *num_surrogates = 0;
1019 *maxchar = 0;
1020
1021 for (iter = begin; iter < end; ) {
1022 if (*iter > *maxchar)
1023 *maxchar = *iter;
1024#if SIZEOF_WCHAR_T == 2
1025 if (*iter >= 0xD800 && *iter <= 0xDBFF
1026 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
1027 {
1028 Py_UCS4 surrogate_val;
1029 surrogate_val = (((iter[0] & 0x3FF)<<10)
1030 | (iter[1] & 0x3FF)) + 0x10000;
1031 ++(*num_surrogates);
1032 if (surrogate_val > *maxchar)
1033 *maxchar = surrogate_val;
1034 iter += 2;
1035 }
1036 else
1037 iter++;
1038#else
1039 iter++;
1040#endif
1041 }
1042 return 0;
1043}
1044
1045#ifdef Py_DEBUG
1046int unicode_ready_calls = 0;
1047#endif
1048
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001049static int
1050unicode_ready(PyObject **p_obj, int replace)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001051{
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001052 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001053 wchar_t *end;
1054 Py_UCS4 maxchar = 0;
1055 Py_ssize_t num_surrogates;
1056#if SIZEOF_WCHAR_T == 2
1057 Py_ssize_t length_wo_surrogates;
1058#endif
1059
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001060 assert(p_obj != NULL);
1061 unicode = (PyUnicodeObject *)*p_obj;
1062
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001063 /* _PyUnicode_Ready() is only intented for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001064 strings were created using _PyObject_New() and where no canonical
1065 representation (the str field) has been set yet aka strings
1066 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001067 assert(_PyUnicode_CHECK(unicode));
1068 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001069 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001070 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001071 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001072 /* Actually, it should neither be interned nor be anything else: */
1073 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001074
1075#ifdef Py_DEBUG
1076 ++unicode_ready_calls;
1077#endif
1078
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001079#ifdef Py_DEBUG
1080 assert(!replace || Py_REFCNT(unicode) == 1);
1081#else
1082 if (replace && Py_REFCNT(unicode) != 1)
1083 replace = 0;
1084#endif
1085 if (replace) {
1086 Py_ssize_t len = _PyUnicode_WSTR_LENGTH(unicode);
1087 wchar_t *wstr = _PyUnicode_WSTR(unicode);
1088 /* Optimization for empty strings */
1089 if (len == 0) {
1090 Py_INCREF(unicode_empty);
1091 Py_DECREF(*p_obj);
1092 *p_obj = unicode_empty;
1093 return 0;
1094 }
1095 if (len == 1 && wstr[0] < 256) {
1096 PyObject *latin1_char = get_latin1_char((unsigned char)wstr[0]);
1097 if (latin1_char == NULL)
1098 return -1;
1099 Py_DECREF(*p_obj);
1100 *p_obj = latin1_char;
1101 return 0;
1102 }
1103 }
1104
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001105 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001106 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001107 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001108 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001109
1110 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001111 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1112 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001113 PyErr_NoMemory();
1114 return -1;
1115 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001116 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001117 _PyUnicode_WSTR(unicode), end,
1118 PyUnicode_1BYTE_DATA(unicode));
1119 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1120 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1121 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1122 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001123 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001124 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001125 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001126 }
1127 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001128 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001129 _PyUnicode_UTF8(unicode) = NULL;
1130 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001131 }
1132 PyObject_FREE(_PyUnicode_WSTR(unicode));
1133 _PyUnicode_WSTR(unicode) = NULL;
1134 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1135 }
1136 /* In this case we might have to convert down from 4-byte native
1137 wchar_t to 2-byte unicode. */
1138 else if (maxchar < 65536) {
1139 assert(num_surrogates == 0 &&
1140 "FindMaxCharAndNumSurrogatePairs() messed up");
1141
Victor Stinner506f5922011-09-28 22:34:18 +02001142#if SIZEOF_WCHAR_T == 2
1143 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001144 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001145 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1146 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1147 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001148 _PyUnicode_UTF8(unicode) = NULL;
1149 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001150#else
1151 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001152 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001153 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001154 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001155 PyErr_NoMemory();
1156 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001157 }
Victor Stinner506f5922011-09-28 22:34:18 +02001158 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1159 _PyUnicode_WSTR(unicode), end,
1160 PyUnicode_2BYTE_DATA(unicode));
1161 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1162 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1163 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001164 _PyUnicode_UTF8(unicode) = NULL;
1165 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001166 PyObject_FREE(_PyUnicode_WSTR(unicode));
1167 _PyUnicode_WSTR(unicode) = NULL;
1168 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1169#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001170 }
1171 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1172 else {
1173#if SIZEOF_WCHAR_T == 2
1174 /* in case the native representation is 2-bytes, we need to allocate a
1175 new normalized 4-byte version. */
1176 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001177 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1178 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001179 PyErr_NoMemory();
1180 return -1;
1181 }
1182 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1183 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001184 _PyUnicode_UTF8(unicode) = NULL;
1185 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001186 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1187 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001188 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001189 PyObject_FREE(_PyUnicode_WSTR(unicode));
1190 _PyUnicode_WSTR(unicode) = NULL;
1191 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1192#else
1193 assert(num_surrogates == 0);
1194
Victor Stinnerc3c74152011-10-02 20:39:55 +02001195 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001196 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001197 _PyUnicode_UTF8(unicode) = NULL;
1198 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001199 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1200#endif
1201 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1202 }
1203 _PyUnicode_STATE(unicode).ready = 1;
1204 return 0;
1205}
1206
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001207int
1208_PyUnicode_ReadyReplace(PyObject **op)
1209{
1210 return unicode_ready(op, 1);
1211}
1212
1213int
1214_PyUnicode_Ready(PyObject *op)
1215{
1216 return unicode_ready(&op, 0);
1217}
1218
Alexander Belopolsky40018472011-02-26 01:02:56 +00001219static void
1220unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001221{
Walter Dörwald16807132007-05-25 13:52:07 +00001222 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001223 case SSTATE_NOT_INTERNED:
1224 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001225
Benjamin Peterson29060642009-01-31 22:14:21 +00001226 case SSTATE_INTERNED_MORTAL:
1227 /* revive dead object temporarily for DelItem */
1228 Py_REFCNT(unicode) = 3;
1229 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
1230 Py_FatalError(
1231 "deletion of interned string failed");
1232 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001233
Benjamin Peterson29060642009-01-31 22:14:21 +00001234 case SSTATE_INTERNED_IMMORTAL:
1235 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001236
Benjamin Peterson29060642009-01-31 22:14:21 +00001237 default:
1238 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001239 }
1240
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001241 if (_PyUnicode_WSTR(unicode) &&
1242 (!PyUnicode_IS_READY(unicode) ||
1243 _PyUnicode_WSTR(unicode) != PyUnicode_DATA(unicode)))
1244 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001245 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001246 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001247
1248 if (PyUnicode_IS_COMPACT(unicode)) {
1249 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001250 }
1251 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001252 if (_PyUnicode_DATA_ANY(unicode))
1253 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Benjamin Peterson29060642009-01-31 22:14:21 +00001254 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001255 }
1256}
1257
Alexander Belopolsky40018472011-02-26 01:02:56 +00001258static int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001259unicode_resizable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001260{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001261 if (Py_REFCNT(unicode) != 1)
1262 return 0;
1263 if (PyUnicode_CHECK_INTERNED(unicode))
1264 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001265 assert (unicode != unicode_empty);
1266#ifdef Py_DEBUG
1267 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND
1268 && PyUnicode_GET_LENGTH(unicode) == 1)
1269 {
1270 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001271 if (ch < 256 && unicode_latin1[ch] == unicode)
1272 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001273 }
Victor Stinner77bb47b2011-10-03 20:06:05 +02001274#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001275 return 1;
1276}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001277
Victor Stinnerfe226c02011-10-03 03:52:20 +02001278static int
1279unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1280{
1281 PyObject *unicode;
1282 Py_ssize_t old_length;
1283
1284 assert(p_unicode != NULL);
1285 unicode = *p_unicode;
1286
1287 assert(unicode != NULL);
1288 assert(PyUnicode_Check(unicode));
1289 assert(0 <= length);
1290
Victor Stinner910337b2011-10-03 03:20:16 +02001291 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001292 old_length = PyUnicode_WSTR_LENGTH(unicode);
1293 else
1294 old_length = PyUnicode_GET_LENGTH(unicode);
1295 if (old_length == length)
1296 return 0;
1297
Victor Stinnerfe226c02011-10-03 03:52:20 +02001298 if (!unicode_resizable(unicode)) {
1299 PyObject *copy = resize_copy(unicode, length);
1300 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001301 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001302 Py_DECREF(*p_unicode);
1303 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001304 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001305 }
1306
Victor Stinnerfe226c02011-10-03 03:52:20 +02001307 if (PyUnicode_IS_COMPACT(unicode)) {
1308 *p_unicode = resize_compact(unicode, length);
1309 if (*p_unicode == NULL)
1310 return -1;
1311 return 0;
1312 } else
1313 return resize_inplace((PyUnicodeObject*)unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001314}
1315
Alexander Belopolsky40018472011-02-26 01:02:56 +00001316int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001317PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001318{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001319 PyObject *unicode;
1320 if (p_unicode == NULL) {
1321 PyErr_BadInternalCall();
1322 return -1;
1323 }
1324 unicode = *p_unicode;
1325 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0
1326 || _PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND)
1327 {
1328 PyErr_BadInternalCall();
1329 return -1;
1330 }
1331 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001332}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001333
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001334static PyObject*
1335get_latin1_char(unsigned char ch)
1336{
Victor Stinnera464fc12011-10-02 20:39:30 +02001337 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001338 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001339 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001340 if (!unicode)
1341 return NULL;
1342 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1343 unicode_latin1[ch] = unicode;
1344 }
1345 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001346 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001347}
1348
Alexander Belopolsky40018472011-02-26 01:02:56 +00001349PyObject *
1350PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001351{
1352 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001353 Py_UCS4 maxchar = 0;
1354 Py_ssize_t num_surrogates;
1355
1356 if (u == NULL)
1357 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001358
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001359 /* If the Unicode data is known at construction time, we can apply
1360 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001361
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001362 /* Optimization for empty strings */
1363 if (size == 0 && unicode_empty != NULL) {
1364 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001365 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001366 }
Tim Petersced69f82003-09-16 20:30:58 +00001367
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001368 /* Single character Unicode objects in the Latin-1 range are
1369 shared when using this constructor */
1370 if (size == 1 && *u < 256)
1371 return get_latin1_char((unsigned char)*u);
1372
1373 /* If not empty and not single character, copy the Unicode data
1374 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001375 if (find_maxchar_surrogates(u, u + size,
1376 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001377 return NULL;
1378
1379 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1380 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001381 if (!unicode)
1382 return NULL;
1383
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001384 switch (PyUnicode_KIND(unicode)) {
1385 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001386 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001387 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1388 break;
1389 case PyUnicode_2BYTE_KIND:
1390#if Py_UNICODE_SIZE == 2
1391 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1392#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001393 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001394 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1395#endif
1396 break;
1397 case PyUnicode_4BYTE_KIND:
1398#if SIZEOF_WCHAR_T == 2
1399 /* This is the only case which has to process surrogates, thus
1400 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001401 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001402#else
1403 assert(num_surrogates == 0);
1404 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1405#endif
1406 break;
1407 default:
1408 assert(0 && "Impossible state");
1409 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001410
1411 return (PyObject *)unicode;
1412}
1413
Alexander Belopolsky40018472011-02-26 01:02:56 +00001414PyObject *
1415PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001416{
1417 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001418
Benjamin Peterson14339b62009-01-31 16:36:08 +00001419 if (size < 0) {
1420 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001421 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001422 return NULL;
1423 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001424
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001425 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001426 some optimizations which share commonly used objects.
1427 Also, this means the input must be UTF-8, so fall back to the
1428 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001429 if (u != NULL) {
1430
Benjamin Peterson29060642009-01-31 22:14:21 +00001431 /* Optimization for empty strings */
1432 if (size == 0 && unicode_empty != NULL) {
1433 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001434 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001435 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001436
1437 /* Single characters are shared when using this constructor.
1438 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001439 if (size == 1 && Py_CHARMASK(*u) < 128)
1440 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001441
1442 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001443 }
1444
Walter Dörwald55507312007-05-18 13:12:10 +00001445 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001446 if (!unicode)
1447 return NULL;
1448
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001449 return (PyObject *)unicode;
1450}
1451
Alexander Belopolsky40018472011-02-26 01:02:56 +00001452PyObject *
1453PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001454{
1455 size_t size = strlen(u);
1456 if (size > PY_SSIZE_T_MAX) {
1457 PyErr_SetString(PyExc_OverflowError, "input too long");
1458 return NULL;
1459 }
1460
1461 return PyUnicode_FromStringAndSize(u, size);
1462}
1463
Victor Stinnere57b1c02011-09-28 22:20:48 +02001464static PyObject*
1465_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001466{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001467 PyObject *res;
1468 unsigned char max = 127;
1469 Py_ssize_t i;
1470 for (i = 0; i < size; i++) {
1471 if (u[i] & 0x80) {
1472 max = 255;
1473 break;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001474 }
1475 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001476 res = PyUnicode_New(size, max);
1477 if (!res)
1478 return NULL;
1479 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1480 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001481}
1482
Victor Stinnere57b1c02011-09-28 22:20:48 +02001483static PyObject*
1484_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001485{
1486 PyObject *res;
1487 Py_UCS2 max = 0;
1488 Py_ssize_t i;
1489 for (i = 0; i < size; i++)
1490 if (u[i] > max)
1491 max = u[i];
1492 res = PyUnicode_New(size, max);
1493 if (!res)
1494 return NULL;
1495 if (max >= 256)
1496 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1497 else
1498 for (i = 0; i < size; i++)
1499 PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i];
1500 return res;
1501}
1502
Victor Stinnere57b1c02011-09-28 22:20:48 +02001503static PyObject*
1504_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001505{
1506 PyObject *res;
1507 Py_UCS4 max = 0;
1508 Py_ssize_t i;
1509 for (i = 0; i < size; i++)
1510 if (u[i] > max)
1511 max = u[i];
1512 res = PyUnicode_New(size, max);
1513 if (!res)
1514 return NULL;
1515 if (max >= 0x10000)
1516 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1517 else {
1518 int kind = PyUnicode_KIND(res);
1519 void *data = PyUnicode_DATA(res);
1520 for (i = 0; i < size; i++)
1521 PyUnicode_WRITE(kind, data, i, u[i]);
1522 }
1523 return res;
1524}
1525
1526PyObject*
1527PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1528{
1529 switch(kind) {
1530 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001531 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001532 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001533 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001534 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001535 return _PyUnicode_FromUCS4(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001536 }
Victor Stinner202b62b2011-10-01 23:48:37 +02001537 PyErr_SetString(PyExc_ValueError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001538 return NULL;
1539}
1540
Victor Stinner034f6cf2011-09-30 02:26:44 +02001541PyObject*
1542PyUnicode_Copy(PyObject *unicode)
1543{
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001544 Py_ssize_t size;
1545 PyObject *copy;
1546 void *data;
1547
Victor Stinner034f6cf2011-09-30 02:26:44 +02001548 if (!PyUnicode_Check(unicode)) {
1549 PyErr_BadInternalCall();
1550 return NULL;
1551 }
1552 if (PyUnicode_READY(unicode))
1553 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001554
1555 size = PyUnicode_GET_LENGTH(unicode);
1556 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1557 if (!copy)
1558 return NULL;
1559 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1560
1561 data = PyUnicode_DATA(unicode);
1562 switch (PyUnicode_KIND(unicode))
1563 {
1564 case PyUnicode_1BYTE_KIND:
1565 memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1566 break;
1567 case PyUnicode_2BYTE_KIND:
1568 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1569 break;
1570 case PyUnicode_4BYTE_KIND:
1571 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1572 break;
1573 default:
1574 assert(0);
1575 break;
1576 }
1577 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001578}
1579
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001580
Victor Stinnerbc603d12011-10-02 01:00:40 +02001581/* Widen Unicode objects to larger buffers. Don't write terminating null
1582 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001583
1584void*
1585_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1586{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001587 Py_ssize_t len;
1588 void *result;
1589 unsigned int skind;
1590
1591 if (PyUnicode_READY(s))
1592 return NULL;
1593
1594 len = PyUnicode_GET_LENGTH(s);
1595 skind = PyUnicode_KIND(s);
1596 if (skind >= kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001597 PyErr_SetString(PyExc_RuntimeError, "invalid widening attempt");
1598 return NULL;
1599 }
1600 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001601 case PyUnicode_2BYTE_KIND:
1602 result = PyMem_Malloc(len * sizeof(Py_UCS2));
1603 if (!result)
1604 return PyErr_NoMemory();
1605 assert(skind == PyUnicode_1BYTE_KIND);
1606 _PyUnicode_CONVERT_BYTES(
1607 Py_UCS1, Py_UCS2,
1608 PyUnicode_1BYTE_DATA(s),
1609 PyUnicode_1BYTE_DATA(s) + len,
1610 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001611 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001612 case PyUnicode_4BYTE_KIND:
1613 result = PyMem_Malloc(len * sizeof(Py_UCS4));
1614 if (!result)
1615 return PyErr_NoMemory();
1616 if (skind == PyUnicode_2BYTE_KIND) {
1617 _PyUnicode_CONVERT_BYTES(
1618 Py_UCS2, Py_UCS4,
1619 PyUnicode_2BYTE_DATA(s),
1620 PyUnicode_2BYTE_DATA(s) + len,
1621 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001622 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001623 else {
1624 assert(skind == PyUnicode_1BYTE_KIND);
1625 _PyUnicode_CONVERT_BYTES(
1626 Py_UCS1, Py_UCS4,
1627 PyUnicode_1BYTE_DATA(s),
1628 PyUnicode_1BYTE_DATA(s) + len,
1629 result);
1630 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001631 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001632 default:
1633 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001634 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001635 PyErr_SetString(PyExc_ValueError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001636 return NULL;
1637}
1638
1639static Py_UCS4*
1640as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1641 int copy_null)
1642{
1643 int kind;
1644 void *data;
1645 Py_ssize_t len, targetlen;
1646 if (PyUnicode_READY(string) == -1)
1647 return NULL;
1648 kind = PyUnicode_KIND(string);
1649 data = PyUnicode_DATA(string);
1650 len = PyUnicode_GET_LENGTH(string);
1651 targetlen = len;
1652 if (copy_null)
1653 targetlen++;
1654 if (!target) {
1655 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1656 PyErr_NoMemory();
1657 return NULL;
1658 }
1659 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1660 if (!target) {
1661 PyErr_NoMemory();
1662 return NULL;
1663 }
1664 }
1665 else {
1666 if (targetsize < targetlen) {
1667 PyErr_Format(PyExc_SystemError,
1668 "string is longer than the buffer");
1669 if (copy_null && 0 < targetsize)
1670 target[0] = 0;
1671 return NULL;
1672 }
1673 }
1674 if (kind != PyUnicode_4BYTE_KIND) {
1675 Py_ssize_t i;
1676 for (i = 0; i < len; i++)
1677 target[i] = PyUnicode_READ(kind, data, i);
1678 }
1679 else
1680 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1681 if (copy_null)
1682 target[len] = 0;
1683 return target;
1684}
1685
1686Py_UCS4*
1687PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1688 int copy_null)
1689{
1690 if (target == NULL || targetsize < 1) {
1691 PyErr_BadInternalCall();
1692 return NULL;
1693 }
1694 return as_ucs4(string, target, targetsize, copy_null);
1695}
1696
1697Py_UCS4*
1698PyUnicode_AsUCS4Copy(PyObject *string)
1699{
1700 return as_ucs4(string, NULL, 0, 1);
1701}
1702
1703#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00001704
Alexander Belopolsky40018472011-02-26 01:02:56 +00001705PyObject *
1706PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001707{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001708 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +00001709 if (size == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001710 return PyUnicode_New(0, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00001711 PyErr_BadInternalCall();
1712 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001713 }
1714
Martin v. Löwis790465f2008-04-05 20:41:37 +00001715 if (size == -1) {
1716 size = wcslen(w);
1717 }
1718
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001719 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001720}
1721
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001722#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00001723
Walter Dörwald346737f2007-05-31 10:44:43 +00001724static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001725makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
1726 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00001727{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001728 *fmt++ = '%';
1729 if (width) {
1730 if (zeropad)
1731 *fmt++ = '0';
1732 fmt += sprintf(fmt, "%d", width);
1733 }
1734 if (precision)
1735 fmt += sprintf(fmt, ".%d", precision);
1736 if (longflag)
1737 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001738 else if (longlongflag) {
1739 /* longlongflag should only ever be nonzero on machines with
1740 HAVE_LONG_LONG defined */
1741#ifdef HAVE_LONG_LONG
1742 char *f = PY_FORMAT_LONG_LONG;
1743 while (*f)
1744 *fmt++ = *f++;
1745#else
1746 /* we shouldn't ever get here */
1747 assert(0);
1748 *fmt++ = 'l';
1749#endif
1750 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001751 else if (size_tflag) {
1752 char *f = PY_FORMAT_SIZE_T;
1753 while (*f)
1754 *fmt++ = *f++;
1755 }
1756 *fmt++ = c;
1757 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00001758}
1759
/* Helper for PyUnicode_FromFormatV(): parse the part of a conversion
   specification that sits between the '%' and the conversion character.

   f -- points at the '%' that starts the specification.

   The parsed values are stored through p_width, p_precision, p_longflag,
   p_longlongflag and p_size_tflag; any of these pointers may be NULL when
   the caller is not interested.  A width or precision that is absent is
   reported as 0.

   Returns a pointer to the character following the parsed width,
   precision and length modifier -- normally the conversion character.
   A length modifier ("l", "ll", "z") is only consumed when it is directly
   followed by 'd', 'u' or 'i'. */
static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* width.precision, e.g. "%2.5s" => width=2, precision=5 */
    f++;
    for (; Py_ISDIGIT((unsigned)*f); f++)
        width = (width*10) + *f - '0';
    if (*f == '.') {
        for (f++; Py_ISDIGIT((unsigned)*f); f++)
            precision = (precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => step back so that f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format such as "%.1" => step back, f points to "1" */
        f--;
    }

    /* length modifiers: %ld, %lu, %li, %lld, %llu, %lli, %zd, %zu, %zi */
    if (*f == 'l') {
        const char next = f[1];
        if (next == 'd' || next == 'u' || next == 'i') {
            longflag = 1;
            f += 1;
        }
#ifdef HAVE_LONG_LONG
        else if (next == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
    }
    else if (*f == 'z') {
        const char next = f[1];
        if (next == 'd' || next == 'u' || next == 'i') {
            size_tflag = 1;
            f += 1;
        }
    }

    /* hand the results back to whoever asked for them */
    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;
    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
1824
/* Upper bound on the number of characters produced by %ld: 21 characters
   are enough for a 64-bit integer written in decimal, including an
   optional sign. */
#define MAX_LONG_CHARS 21

/* Upper bound on the number of characters produced by %lld.  At most
   ceil(log10(256) * SIZEOF_LONG_LONG) digits are needed, plus 1 for the
   sign; 53/22 is used as an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG * 53 - 1) / 22)
1832
Walter Dörwaldd2034312007-05-18 16:29:38 +00001833PyObject *
1834PyUnicode_FromFormatV(const char *format, va_list vargs)
1835{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001836 va_list count;
1837 Py_ssize_t callcount = 0;
1838 PyObject **callresults = NULL;
1839 PyObject **callresult = NULL;
1840 Py_ssize_t n = 0;
1841 int width = 0;
1842 int precision = 0;
1843 int zeropad;
1844 const char* f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001845 PyUnicodeObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001846 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001847 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001848 Py_UCS4 maxchar = 127; /* result is ASCII by default */
1849 Py_UCS4 argmaxchar;
1850 Py_ssize_t numbersize = 0;
1851 char *numberresults = NULL;
1852 char *numberresult = NULL;
1853 Py_ssize_t i;
1854 int kind;
1855 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001856
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001857 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001858 /* step 1: count the number of %S/%R/%A/%s format specifications
1859 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
1860 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001861 * result in an array)
1862 * also esimate a upper bound for all the number formats in the string,
1863 * numbers will be formated in step 3 and be keept in a '\0'-separated
1864 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00001865 for (f = format; *f; f++) {
1866 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001867 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001868 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
1869 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
1870 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
1871 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001872
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001873 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001874#ifdef HAVE_LONG_LONG
1875 if (longlongflag) {
1876 if (width < MAX_LONG_LONG_CHARS)
1877 width = MAX_LONG_LONG_CHARS;
1878 }
1879 else
1880#endif
1881 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
1882 including sign. Decimal takes the most space. This
1883 isn't enough for octal. If a width is specified we
1884 need more (which we allocate later). */
1885 if (width < MAX_LONG_CHARS)
1886 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001887
1888 /* account for the size + '\0' to separate numbers
1889 inside of the numberresults buffer */
1890 numbersize += (width + 1);
1891 }
1892 }
1893 else if ((unsigned char)*f > 127) {
1894 PyErr_Format(PyExc_ValueError,
1895 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
1896 "string, got a non-ASCII byte: 0x%02x",
1897 (unsigned char)*f);
1898 return NULL;
1899 }
1900 }
1901 /* step 2: allocate memory for the results of
1902 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
1903 if (callcount) {
1904 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
1905 if (!callresults) {
1906 PyErr_NoMemory();
1907 return NULL;
1908 }
1909 callresult = callresults;
1910 }
1911 /* step 2.5: allocate memory for the results of formating numbers */
1912 if (numbersize) {
1913 numberresults = PyObject_Malloc(numbersize);
1914 if (!numberresults) {
1915 PyErr_NoMemory();
1916 goto fail;
1917 }
1918 numberresult = numberresults;
1919 }
1920
1921 /* step 3: format numbers and figure out how large a buffer we need */
1922 for (f = format; *f; f++) {
1923 if (*f == '%') {
1924 const char* p;
1925 int longflag;
1926 int longlongflag;
1927 int size_tflag;
1928 int numprinted;
1929
1930 p = f;
1931 zeropad = (f[1] == '0');
1932 f = parse_format_flags(f, &width, &precision,
1933 &longflag, &longlongflag, &size_tflag);
1934 switch (*f) {
1935 case 'c':
1936 {
1937 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001938 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001939 n++;
1940 break;
1941 }
1942 case '%':
1943 n++;
1944 break;
1945 case 'i':
1946 case 'd':
1947 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1948 width, precision, *f);
1949 if (longflag)
1950 numprinted = sprintf(numberresult, fmt,
1951 va_arg(count, long));
1952#ifdef HAVE_LONG_LONG
1953 else if (longlongflag)
1954 numprinted = sprintf(numberresult, fmt,
1955 va_arg(count, PY_LONG_LONG));
1956#endif
1957 else if (size_tflag)
1958 numprinted = sprintf(numberresult, fmt,
1959 va_arg(count, Py_ssize_t));
1960 else
1961 numprinted = sprintf(numberresult, fmt,
1962 va_arg(count, int));
1963 n += numprinted;
1964 /* advance by +1 to skip over the '\0' */
1965 numberresult += (numprinted + 1);
1966 assert(*(numberresult - 1) == '\0');
1967 assert(*(numberresult - 2) != '\0');
1968 assert(numprinted >= 0);
1969 assert(numberresult <= numberresults + numbersize);
1970 break;
1971 case 'u':
1972 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1973 width, precision, 'u');
1974 if (longflag)
1975 numprinted = sprintf(numberresult, fmt,
1976 va_arg(count, unsigned long));
1977#ifdef HAVE_LONG_LONG
1978 else if (longlongflag)
1979 numprinted = sprintf(numberresult, fmt,
1980 va_arg(count, unsigned PY_LONG_LONG));
1981#endif
1982 else if (size_tflag)
1983 numprinted = sprintf(numberresult, fmt,
1984 va_arg(count, size_t));
1985 else
1986 numprinted = sprintf(numberresult, fmt,
1987 va_arg(count, unsigned int));
1988 n += numprinted;
1989 numberresult += (numprinted + 1);
1990 assert(*(numberresult - 1) == '\0');
1991 assert(*(numberresult - 2) != '\0');
1992 assert(numprinted >= 0);
1993 assert(numberresult <= numberresults + numbersize);
1994 break;
1995 case 'x':
1996 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
1997 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
1998 n += numprinted;
1999 numberresult += (numprinted + 1);
2000 assert(*(numberresult - 1) == '\0');
2001 assert(*(numberresult - 2) != '\0');
2002 assert(numprinted >= 0);
2003 assert(numberresult <= numberresults + numbersize);
2004 break;
2005 case 'p':
2006 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2007 /* %p is ill-defined: ensure leading 0x. */
2008 if (numberresult[1] == 'X')
2009 numberresult[1] = 'x';
2010 else if (numberresult[1] != 'x') {
2011 memmove(numberresult + 2, numberresult,
2012 strlen(numberresult) + 1);
2013 numberresult[0] = '0';
2014 numberresult[1] = 'x';
2015 numprinted += 2;
2016 }
2017 n += numprinted;
2018 numberresult += (numprinted + 1);
2019 assert(*(numberresult - 1) == '\0');
2020 assert(*(numberresult - 2) != '\0');
2021 assert(numprinted >= 0);
2022 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002023 break;
2024 case 's':
2025 {
2026 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002027 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002028 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
2029 if (!str)
2030 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002031 /* since PyUnicode_DecodeUTF8 returns already flexible
2032 unicode objects, there is no need to call ready on them */
2033 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002034 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002035 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002036 /* Remember the str and switch to the next slot */
2037 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002038 break;
2039 }
2040 case 'U':
2041 {
2042 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002043 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002044 if (PyUnicode_READY(obj) == -1)
2045 goto fail;
2046 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002047 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002048 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002049 break;
2050 }
2051 case 'V':
2052 {
2053 PyObject *obj = va_arg(count, PyObject *);
2054 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002055 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002056 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002057 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002058 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002059 if (PyUnicode_READY(obj) == -1)
2060 goto fail;
2061 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002062 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002063 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002064 *callresult++ = NULL;
2065 }
2066 else {
2067 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2068 if (!str_obj)
2069 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002070 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002071 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002072 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002073 *callresult++ = str_obj;
2074 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002075 break;
2076 }
2077 case 'S':
2078 {
2079 PyObject *obj = va_arg(count, PyObject *);
2080 PyObject *str;
2081 assert(obj);
2082 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002083 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002084 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002085 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002086 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002087 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002088 /* Remember the str and switch to the next slot */
2089 *callresult++ = str;
2090 break;
2091 }
2092 case 'R':
2093 {
2094 PyObject *obj = va_arg(count, PyObject *);
2095 PyObject *repr;
2096 assert(obj);
2097 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002098 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002099 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002100 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002101 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002102 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002103 /* Remember the repr and switch to the next slot */
2104 *callresult++ = repr;
2105 break;
2106 }
2107 case 'A':
2108 {
2109 PyObject *obj = va_arg(count, PyObject *);
2110 PyObject *ascii;
2111 assert(obj);
2112 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002113 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002114 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002115 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002116 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002117 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002118 /* Remember the repr and switch to the next slot */
2119 *callresult++ = ascii;
2120 break;
2121 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002122 default:
2123 /* if we stumble upon an unknown
2124 formatting code, copy the rest of
2125 the format string to the output
2126 string. (we cannot just skip the
2127 code, since there's no way to know
2128 what's in the argument list) */
2129 n += strlen(p);
2130 goto expand;
2131 }
2132 } else
2133 n++;
2134 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002135 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002136 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002137 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002138 we don't have to resize the string.
2139 There can be no errors beyond this point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002140 string = (PyUnicodeObject *)PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002141 if (!string)
2142 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002143 kind = PyUnicode_KIND(string);
2144 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002145 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002146 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002147
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002148 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002149 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002150 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002151
2152 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002153 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2154 /* checking for == because the last argument could be a empty
2155 string, which causes i to point to end, the assert at the end of
2156 the loop */
2157 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002158
Benjamin Peterson14339b62009-01-31 16:36:08 +00002159 switch (*f) {
2160 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002161 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002162 const int ordinal = va_arg(vargs, int);
2163 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002164 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002165 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002166 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002167 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002168 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002169 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002170 case 'p':
2171 /* unused, since we already have the result */
2172 if (*f == 'p')
2173 (void) va_arg(vargs, void *);
2174 else
2175 (void) va_arg(vargs, int);
2176 /* extract the result from numberresults and append. */
2177 for (; *numberresult; ++i, ++numberresult)
2178 PyUnicode_WRITE(kind, data, i, *numberresult);
2179 /* skip over the separating '\0' */
2180 assert(*numberresult == '\0');
2181 numberresult++;
2182 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002183 break;
2184 case 's':
2185 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002186 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002187 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002188 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002189 size = PyUnicode_GET_LENGTH(*callresult);
2190 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002191 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2192 *callresult, 0,
2193 size) < 0)
2194 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002195 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002196 /* We're done with the unicode()/repr() => forget it */
2197 Py_DECREF(*callresult);
2198 /* switch to next unicode()/repr() result */
2199 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002200 break;
2201 }
2202 case 'U':
2203 {
2204 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002205 Py_ssize_t size;
2206 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2207 size = PyUnicode_GET_LENGTH(obj);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002208 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2209 obj, 0,
2210 size) < 0)
2211 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002212 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002213 break;
2214 }
2215 case 'V':
2216 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002217 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002218 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002219 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002220 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002221 size = PyUnicode_GET_LENGTH(obj);
2222 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002223 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2224 obj, 0,
2225 size) < 0)
2226 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002227 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002228 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002229 size = PyUnicode_GET_LENGTH(*callresult);
2230 assert(PyUnicode_KIND(*callresult) <=
2231 PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002232 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2233 *callresult,
2234 0, size) < 0)
2235 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002236 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002237 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002238 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002239 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002240 break;
2241 }
2242 case 'S':
2243 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002244 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002245 {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002246 /* unused, since we already have the result */
2247 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002248 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002249 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2250 *callresult, 0,
2251 PyUnicode_GET_LENGTH(*callresult)) < 0)
2252 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002253 i += PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002254 /* We're done with the unicode()/repr() => forget it */
2255 Py_DECREF(*callresult);
2256 /* switch to next unicode()/repr() result */
2257 ++callresult;
2258 break;
2259 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002260 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002261 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002262 break;
2263 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002264 for (; *p; ++p, ++i)
2265 PyUnicode_WRITE(kind, data, i, *p);
2266 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002267 goto end;
2268 }
Victor Stinner1205f272010-09-11 00:54:47 +00002269 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002270 else {
2271 assert(i < PyUnicode_GET_LENGTH(string));
2272 PyUnicode_WRITE(kind, data, i++, *f);
2273 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002274 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002275 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002276
Benjamin Peterson29060642009-01-31 22:14:21 +00002277 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002278 if (callresults)
2279 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002280 if (numberresults)
2281 PyObject_Free(numberresults);
2282 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00002283 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002284 if (callresults) {
2285 PyObject **callresult2 = callresults;
2286 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002287 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002288 ++callresult2;
2289 }
2290 PyObject_Free(callresults);
2291 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002292 if (numberresults)
2293 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002294 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002295}
2296
Walter Dörwaldd2034312007-05-18 16:29:38 +00002297PyObject *
2298PyUnicode_FromFormat(const char *format, ...)
2299{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002300 PyObject* ret;
2301 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002302
2303#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002304 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002305#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002306 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002307#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002308 ret = PyUnicode_FromFormatV(format, vargs);
2309 va_end(vargs);
2310 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002311}
2312
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002313#ifdef HAVE_WCHAR_H
2314
Victor Stinner5593d8a2010-10-02 11:11:27 +00002315/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2316 convert a Unicode object to a wide character string.
2317
Victor Stinnerd88d9832011-09-06 02:00:05 +02002318 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002319 character) required to convert the unicode object. Ignore size argument.
2320
Victor Stinnerd88d9832011-09-06 02:00:05 +02002321 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002322 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002323 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002324static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00002325unicode_aswidechar(PyUnicodeObject *unicode,
2326 wchar_t *w,
2327 Py_ssize_t size)
2328{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002329 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002330 const wchar_t *wstr;
2331
2332 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
2333 if (wstr == NULL)
2334 return -1;
2335
Victor Stinner5593d8a2010-10-02 11:11:27 +00002336 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002337 if (size > res)
2338 size = res + 1;
2339 else
2340 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002341 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002342 return res;
2343 }
2344 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002345 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002346}
2347
2348Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002349PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002350 wchar_t *w,
2351 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002352{
2353 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002354 PyErr_BadInternalCall();
2355 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002356 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002357 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002358}
2359
Victor Stinner137c34c2010-09-29 10:25:54 +00002360wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002361PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002362 Py_ssize_t *size)
2363{
2364 wchar_t* buffer;
2365 Py_ssize_t buflen;
2366
2367 if (unicode == NULL) {
2368 PyErr_BadInternalCall();
2369 return NULL;
2370 }
2371
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002372 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002373 if (buflen == -1)
2374 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002375 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002376 PyErr_NoMemory();
2377 return NULL;
2378 }
2379
Victor Stinner137c34c2010-09-29 10:25:54 +00002380 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2381 if (buffer == NULL) {
2382 PyErr_NoMemory();
2383 return NULL;
2384 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002385 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002386 if (buflen == -1)
2387 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002388 if (size != NULL)
2389 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002390 return buffer;
2391}
2392
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002393#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002394
Alexander Belopolsky40018472011-02-26 01:02:56 +00002395PyObject *
2396PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002397{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002398 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002399 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002400 PyErr_SetString(PyExc_ValueError,
2401 "chr() arg not in range(0x110000)");
2402 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002403 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002404
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002405 if (ordinal < 256)
2406 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002407
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002408 v = PyUnicode_New(1, ordinal);
2409 if (v == NULL)
2410 return NULL;
2411 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
2412 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002413}
2414
Alexander Belopolsky40018472011-02-26 01:02:56 +00002415PyObject *
2416PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002417{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002418 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002419 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002420 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002421 if (PyUnicode_READY(obj))
2422 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002423 Py_INCREF(obj);
2424 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002425 }
2426 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002427 /* For a Unicode subtype that's not a Unicode object,
2428 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002429 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002430 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002431 PyErr_Format(PyExc_TypeError,
2432 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002433 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002434 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002435}
2436
Alexander Belopolsky40018472011-02-26 01:02:56 +00002437PyObject *
2438PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002439 const char *encoding,
2440 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002441{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002442 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002443 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002444
Guido van Rossumd57fd912000-03-10 22:53:23 +00002445 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002446 PyErr_BadInternalCall();
2447 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002448 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002449
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002450 /* Decoding bytes objects is the most common case and should be fast */
2451 if (PyBytes_Check(obj)) {
2452 if (PyBytes_GET_SIZE(obj) == 0) {
2453 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002454 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002455 }
2456 else {
2457 v = PyUnicode_Decode(
2458 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2459 encoding, errors);
2460 }
2461 return v;
2462 }
2463
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002464 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002465 PyErr_SetString(PyExc_TypeError,
2466 "decoding str is not supported");
2467 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002468 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002469
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002470 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2471 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2472 PyErr_Format(PyExc_TypeError,
2473 "coercing to str: need bytes, bytearray "
2474 "or buffer-like object, %.80s found",
2475 Py_TYPE(obj)->tp_name);
2476 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002477 }
Tim Petersced69f82003-09-16 20:30:58 +00002478
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002479 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002480 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002481 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002482 }
Tim Petersced69f82003-09-16 20:30:58 +00002483 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002484 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002485
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002486 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002487 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002488}
2489
/* Write a normalized copy of encoding into lower: upper case letters are
   converted to lower case and every '_' becomes '-', so that e.g. UTF_8 is
   recognized.  lower_len is the capacity of lower, null terminator included.

   Return 1 on success (lower is null-terminated), 0 if encoding is longer
   than lower_len-1 characters (lower is then left without a terminator). */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src = encoding;
    char *dst = lower;
    /* Last usable slot; it is reserved for the null terminator. */
    char *const last = &lower[lower_len - 1];

    for (; *src != '\0'; src++, dst++) {
        if (dst == last)
            return 0;

        if (Py_ISUPPER(*src))
            *dst = Py_TOLOWER(*src);
        else if (*src == '_')
            *dst = '-';
        else
            *dst = *src;
    }
    *dst = '\0';
    return 1;
}
2522
Alexander Belopolsky40018472011-02-26 01:02:56 +00002523PyObject *
2524PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002525 Py_ssize_t size,
2526 const char *encoding,
2527 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002528{
2529 PyObject *buffer = NULL, *unicode;
2530 Py_buffer info;
2531 char lower[11]; /* Enough for any encoding shortcut */
2532
2533 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002534 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00002535
2536 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002537 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002538 if ((strcmp(lower, "utf-8") == 0) ||
2539 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002540 return PyUnicode_DecodeUTF8(s, size, errors);
2541 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002542 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002543 (strcmp(lower, "iso-8859-1") == 0))
2544 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002545#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002546 else if (strcmp(lower, "mbcs") == 0)
2547 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002548#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002549 else if (strcmp(lower, "ascii") == 0)
2550 return PyUnicode_DecodeASCII(s, size, errors);
2551 else if (strcmp(lower, "utf-16") == 0)
2552 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2553 else if (strcmp(lower, "utf-32") == 0)
2554 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2555 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002556
2557 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002558 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002559 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002560 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002561 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002562 if (buffer == NULL)
2563 goto onError;
2564 unicode = PyCodec_Decode(buffer, encoding, errors);
2565 if (unicode == NULL)
2566 goto onError;
2567 if (!PyUnicode_Check(unicode)) {
2568 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002569 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002570 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002571 Py_DECREF(unicode);
2572 goto onError;
2573 }
2574 Py_DECREF(buffer);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02002575 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002576 Py_DECREF(unicode);
2577 return NULL;
2578 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002579 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002580
Benjamin Peterson29060642009-01-31 22:14:21 +00002581 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002582 Py_XDECREF(buffer);
2583 return NULL;
2584}
2585
Alexander Belopolsky40018472011-02-26 01:02:56 +00002586PyObject *
2587PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002588 const char *encoding,
2589 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002590{
2591 PyObject *v;
2592
2593 if (!PyUnicode_Check(unicode)) {
2594 PyErr_BadArgument();
2595 goto onError;
2596 }
2597
2598 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002599 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002600
2601 /* Decode via the codec registry */
2602 v = PyCodec_Decode(unicode, encoding, errors);
2603 if (v == NULL)
2604 goto onError;
2605 return v;
2606
Benjamin Peterson29060642009-01-31 22:14:21 +00002607 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002608 return NULL;
2609}
2610
Alexander Belopolsky40018472011-02-26 01:02:56 +00002611PyObject *
2612PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002613 const char *encoding,
2614 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002615{
2616 PyObject *v;
2617
2618 if (!PyUnicode_Check(unicode)) {
2619 PyErr_BadArgument();
2620 goto onError;
2621 }
2622
2623 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002624 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002625
2626 /* Decode via the codec registry */
2627 v = PyCodec_Decode(unicode, encoding, errors);
2628 if (v == NULL)
2629 goto onError;
2630 if (!PyUnicode_Check(v)) {
2631 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002632 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002633 Py_TYPE(v)->tp_name);
2634 Py_DECREF(v);
2635 goto onError;
2636 }
2637 return v;
2638
Benjamin Peterson29060642009-01-31 22:14:21 +00002639 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002640 return NULL;
2641}
2642
Alexander Belopolsky40018472011-02-26 01:02:56 +00002643PyObject *
2644PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002645 Py_ssize_t size,
2646 const char *encoding,
2647 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002648{
2649 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002650
Guido van Rossumd57fd912000-03-10 22:53:23 +00002651 unicode = PyUnicode_FromUnicode(s, size);
2652 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002653 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002654 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2655 Py_DECREF(unicode);
2656 return v;
2657}
2658
Alexander Belopolsky40018472011-02-26 01:02:56 +00002659PyObject *
2660PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002661 const char *encoding,
2662 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002663{
2664 PyObject *v;
2665
2666 if (!PyUnicode_Check(unicode)) {
2667 PyErr_BadArgument();
2668 goto onError;
2669 }
2670
2671 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002672 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002673
2674 /* Encode via the codec registry */
2675 v = PyCodec_Encode(unicode, encoding, errors);
2676 if (v == NULL)
2677 goto onError;
2678 return v;
2679
Benjamin Peterson29060642009-01-31 22:14:21 +00002680 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002681 return NULL;
2682}
2683
Victor Stinnerad158722010-10-27 00:25:46 +00002684PyObject *
2685PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00002686{
Victor Stinner99b95382011-07-04 14:23:54 +02002687#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002688 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2689 PyUnicode_GET_SIZE(unicode),
2690 NULL);
2691#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002692 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00002693#else
Victor Stinner793b5312011-04-27 00:24:21 +02002694 PyInterpreterState *interp = PyThreadState_GET()->interp;
2695 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2696 cannot use it to encode and decode filenames before it is loaded. Load
2697 the Python codec requires to encode at least its own filename. Use the C
2698 version of the locale codec until the codec registry is initialized and
2699 the Python codec is loaded.
2700
2701 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2702 cannot only rely on it: check also interp->fscodec_initialized for
2703 subinterpreters. */
2704 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00002705 return PyUnicode_AsEncodedString(unicode,
2706 Py_FileSystemDefaultEncoding,
2707 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00002708 }
2709 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002710 /* locale encoding with surrogateescape */
2711 wchar_t *wchar;
2712 char *bytes;
2713 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00002714 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002715
2716 wchar = PyUnicode_AsWideCharString(unicode, NULL);
2717 if (wchar == NULL)
2718 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002719 bytes = _Py_wchar2char(wchar, &error_pos);
2720 if (bytes == NULL) {
2721 if (error_pos != (size_t)-1) {
2722 char *errmsg = strerror(errno);
2723 PyObject *exc = NULL;
2724 if (errmsg == NULL)
2725 errmsg = "Py_wchar2char() failed";
2726 raise_encode_exception(&exc,
2727 "filesystemencoding",
2728 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
2729 error_pos, error_pos+1,
2730 errmsg);
2731 Py_XDECREF(exc);
2732 }
2733 else
2734 PyErr_NoMemory();
2735 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002736 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002737 }
2738 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002739
2740 bytes_obj = PyBytes_FromString(bytes);
2741 PyMem_Free(bytes);
2742 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00002743 }
Victor Stinnerad158722010-10-27 00:25:46 +00002744#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00002745}
2746
Alexander Belopolsky40018472011-02-26 01:02:56 +00002747PyObject *
2748PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002749 const char *encoding,
2750 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002751{
2752 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00002753 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00002754
Guido van Rossumd57fd912000-03-10 22:53:23 +00002755 if (!PyUnicode_Check(unicode)) {
2756 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002757 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002758 }
Fred Drakee4315f52000-05-09 19:53:39 +00002759
Victor Stinner2f283c22011-03-02 01:21:46 +00002760 if (encoding == NULL) {
2761 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002762 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002763 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002764 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner2f283c22011-03-02 01:21:46 +00002765 }
Fred Drakee4315f52000-05-09 19:53:39 +00002766
2767 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002768 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002769 if ((strcmp(lower, "utf-8") == 0) ||
2770 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00002771 {
Victor Stinner2f283c22011-03-02 01:21:46 +00002772 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002773 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002774 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002775 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00002776 }
Victor Stinner37296e82010-06-10 13:36:23 +00002777 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002778 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002779 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002780 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002781#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002782 else if (strcmp(lower, "mbcs") == 0)
2783 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2784 PyUnicode_GET_SIZE(unicode),
2785 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002786#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002787 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002788 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00002789 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002790
2791 /* Encode via the codec registry */
2792 v = PyCodec_Encode(unicode, encoding, errors);
2793 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002794 return NULL;
2795
2796 /* The normal path */
2797 if (PyBytes_Check(v))
2798 return v;
2799
2800 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002801 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002802 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002803 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002804
2805 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
2806 "encoder %s returned bytearray instead of bytes",
2807 encoding);
2808 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002809 Py_DECREF(v);
2810 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002811 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002812
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002813 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
2814 Py_DECREF(v);
2815 return b;
2816 }
2817
2818 PyErr_Format(PyExc_TypeError,
2819 "encoder did not return a bytes object (type=%.400s)",
2820 Py_TYPE(v)->tp_name);
2821 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002822 return NULL;
2823}
2824
Alexander Belopolsky40018472011-02-26 01:02:56 +00002825PyObject *
2826PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002827 const char *encoding,
2828 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002829{
2830 PyObject *v;
2831
2832 if (!PyUnicode_Check(unicode)) {
2833 PyErr_BadArgument();
2834 goto onError;
2835 }
2836
2837 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002838 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002839
2840 /* Encode via the codec registry */
2841 v = PyCodec_Encode(unicode, encoding, errors);
2842 if (v == NULL)
2843 goto onError;
2844 if (!PyUnicode_Check(v)) {
2845 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002846 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002847 Py_TYPE(v)->tp_name);
2848 Py_DECREF(v);
2849 goto onError;
2850 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002851 return v;
Tim Petersced69f82003-09-16 20:30:58 +00002852
Benjamin Peterson29060642009-01-31 22:14:21 +00002853 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002854 return NULL;
2855}
2856
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002857PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00002858PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002859 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00002860 return PyUnicode_DecodeFSDefaultAndSize(s, size);
2861}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002862
Christian Heimes5894ba72007-11-04 11:43:14 +00002863PyObject*
2864PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
2865{
Victor Stinner99b95382011-07-04 14:23:54 +02002866#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002867 return PyUnicode_DecodeMBCS(s, size, NULL);
2868#elif defined(__APPLE__)
2869 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
2870#else
Victor Stinner793b5312011-04-27 00:24:21 +02002871 PyInterpreterState *interp = PyThreadState_GET()->interp;
2872 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2873 cannot use it to encode and decode filenames before it is loaded. Load
2874 the Python codec requires to encode at least its own filename. Use the C
2875 version of the locale codec until the codec registry is initialized and
2876 the Python codec is loaded.
2877
2878 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2879 cannot only rely on it: check also interp->fscodec_initialized for
2880 subinterpreters. */
2881 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002882 return PyUnicode_Decode(s, size,
2883 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00002884 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002885 }
2886 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002887 /* locale encoding with surrogateescape */
2888 wchar_t *wchar;
2889 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00002890 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002891
2892 if (s[size] != '\0' || size != strlen(s)) {
2893 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2894 return NULL;
2895 }
2896
Victor Stinner168e1172010-10-16 23:16:16 +00002897 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002898 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00002899 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002900
Victor Stinner168e1172010-10-16 23:16:16 +00002901 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002902 PyMem_Free(wchar);
2903 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002904 }
Victor Stinnerad158722010-10-27 00:25:46 +00002905#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002906}
2907
Martin v. Löwis011e8422009-05-05 04:43:17 +00002908
2909int
2910PyUnicode_FSConverter(PyObject* arg, void* addr)
2911{
2912 PyObject *output = NULL;
2913 Py_ssize_t size;
2914 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002915 if (arg == NULL) {
2916 Py_DECREF(*(PyObject**)addr);
2917 return 1;
2918 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00002919 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00002920 output = arg;
2921 Py_INCREF(output);
2922 }
2923 else {
2924 arg = PyUnicode_FromObject(arg);
2925 if (!arg)
2926 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00002927 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002928 Py_DECREF(arg);
2929 if (!output)
2930 return 0;
2931 if (!PyBytes_Check(output)) {
2932 Py_DECREF(output);
2933 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
2934 return 0;
2935 }
2936 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00002937 size = PyBytes_GET_SIZE(output);
2938 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002939 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05002940 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00002941 Py_DECREF(output);
2942 return 0;
2943 }
2944 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002945 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00002946}
2947
2948
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002949int
2950PyUnicode_FSDecoder(PyObject* arg, void* addr)
2951{
2952 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002953 if (arg == NULL) {
2954 Py_DECREF(*(PyObject**)addr);
2955 return 1;
2956 }
2957 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002958 if (PyUnicode_READY(arg))
2959 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002960 output = arg;
2961 Py_INCREF(output);
2962 }
2963 else {
2964 arg = PyBytes_FromObject(arg);
2965 if (!arg)
2966 return 0;
2967 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
2968 PyBytes_GET_SIZE(arg));
2969 Py_DECREF(arg);
2970 if (!output)
2971 return 0;
2972 if (!PyUnicode_Check(output)) {
2973 Py_DECREF(output);
2974 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
2975 return 0;
2976 }
2977 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002978 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
2979 PyUnicode_GET_LENGTH(output), 0, 1)) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002980 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2981 Py_DECREF(output);
2982 return 0;
2983 }
2984 *(PyObject**)addr = output;
2985 return Py_CLEANUP_SUPPORTED;
2986}
2987
2988
Martin v. Löwis5b222132007-06-10 09:51:05 +00002989char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002990PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002991{
Christian Heimesf3863112007-11-22 07:46:41 +00002992 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002993 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
2994
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00002995 if (!PyUnicode_Check(unicode)) {
2996 PyErr_BadArgument();
2997 return NULL;
2998 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002999 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003000 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003001
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003002 if (PyUnicode_UTF8(unicode) == NULL) {
3003 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003004 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3005 if (bytes == NULL)
3006 return NULL;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003007 _PyUnicode_UTF8(u) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3008 if (_PyUnicode_UTF8(u) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003009 Py_DECREF(bytes);
3010 return NULL;
3011 }
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003012 _PyUnicode_UTF8_LENGTH(u) = PyBytes_GET_SIZE(bytes);
3013 Py_MEMCPY(_PyUnicode_UTF8(u), PyBytes_AS_STRING(bytes), _PyUnicode_UTF8_LENGTH(u) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003014 Py_DECREF(bytes);
3015 }
3016
3017 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003018 *psize = PyUnicode_UTF8_LENGTH(unicode);
3019 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003020}
3021
3022char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003023PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003024{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003025 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3026}
3027
3028#ifdef Py_DEBUG
3029int unicode_as_unicode_calls = 0;
3030#endif
3031
3032
3033Py_UNICODE *
3034PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3035{
3036 PyUnicodeObject *u;
3037 const unsigned char *one_byte;
3038#if SIZEOF_WCHAR_T == 4
3039 const Py_UCS2 *two_bytes;
3040#else
3041 const Py_UCS4 *four_bytes;
3042 const Py_UCS4 *ucs4_end;
3043 Py_ssize_t num_surrogates;
3044#endif
3045 wchar_t *w;
3046 wchar_t *wchar_end;
3047
3048 if (!PyUnicode_Check(unicode)) {
3049 PyErr_BadArgument();
3050 return NULL;
3051 }
3052 u = (PyUnicodeObject*)unicode;
3053 if (_PyUnicode_WSTR(u) == NULL) {
3054 /* Non-ASCII compact unicode object */
3055 assert(_PyUnicode_KIND(u) != 0);
3056 assert(PyUnicode_IS_READY(u));
3057
3058#ifdef Py_DEBUG
3059 ++unicode_as_unicode_calls;
3060#endif
3061
3062 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
3063#if SIZEOF_WCHAR_T == 2
3064 four_bytes = PyUnicode_4BYTE_DATA(u);
3065 ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
3066 num_surrogates = 0;
3067
3068 for (; four_bytes < ucs4_end; ++four_bytes) {
3069 if (*four_bytes > 0xFFFF)
3070 ++num_surrogates;
3071 }
3072
3073 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
3074 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
3075 if (!_PyUnicode_WSTR(u)) {
3076 PyErr_NoMemory();
3077 return NULL;
3078 }
3079 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;
3080
3081 w = _PyUnicode_WSTR(u);
3082 wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
3083 four_bytes = PyUnicode_4BYTE_DATA(u);
3084 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3085 if (*four_bytes > 0xFFFF) {
3086 /* encode surrogate pair in this case */
3087 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
3088 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
3089 }
3090 else
3091 *w = *four_bytes;
3092
3093 if (w > wchar_end) {
3094 assert(0 && "Miscalculated string end");
3095 }
3096 }
3097 *w = 0;
3098#else
3099 /* sizeof(wchar_t) == 4 */
3100 Py_FatalError("Impossible unicode object state, wstr and str "
3101 "should share memory already.");
3102 return NULL;
3103#endif
3104 }
3105 else {
3106 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3107 (_PyUnicode_LENGTH(u) + 1));
3108 if (!_PyUnicode_WSTR(u)) {
3109 PyErr_NoMemory();
3110 return NULL;
3111 }
3112 if (!PyUnicode_IS_COMPACT_ASCII(u))
3113 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
3114 w = _PyUnicode_WSTR(u);
3115 wchar_end = w + _PyUnicode_LENGTH(u);
3116
3117 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
3118 one_byte = PyUnicode_1BYTE_DATA(u);
3119 for (; w < wchar_end; ++one_byte, ++w)
3120 *w = *one_byte;
3121 /* null-terminate the wstr */
3122 *w = 0;
3123 }
3124 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
3125#if SIZEOF_WCHAR_T == 4
3126 two_bytes = PyUnicode_2BYTE_DATA(u);
3127 for (; w < wchar_end; ++two_bytes, ++w)
3128 *w = *two_bytes;
3129 /* null-terminate the wstr */
3130 *w = 0;
3131#else
3132 /* sizeof(wchar_t) == 2 */
3133 PyObject_FREE(_PyUnicode_WSTR(u));
3134 _PyUnicode_WSTR(u) = NULL;
3135 Py_FatalError("Impossible unicode object state, wstr "
3136 "and str should share memory already.");
3137 return NULL;
3138#endif
3139 }
3140 else {
3141 assert(0 && "This should never happen.");
3142 }
3143 }
3144 }
3145 if (size != NULL)
3146 *size = PyUnicode_WSTR_LENGTH(u);
3147 return _PyUnicode_WSTR(u);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003148}
3149
Alexander Belopolsky40018472011-02-26 01:02:56 +00003150Py_UNICODE *
3151PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003152{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003153 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003154}
3155
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003156
Alexander Belopolsky40018472011-02-26 01:02:56 +00003157Py_ssize_t
3158PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003159{
3160 if (!PyUnicode_Check(unicode)) {
3161 PyErr_BadArgument();
3162 goto onError;
3163 }
3164 return PyUnicode_GET_SIZE(unicode);
3165
Benjamin Peterson29060642009-01-31 22:14:21 +00003166 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003167 return -1;
3168}
3169
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003170Py_ssize_t
3171PyUnicode_GetLength(PyObject *unicode)
3172{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003173 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003174 PyErr_BadArgument();
3175 return -1;
3176 }
3177
3178 return PyUnicode_GET_LENGTH(unicode);
3179}
3180
3181Py_UCS4
3182PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3183{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003184 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3185 PyErr_BadArgument();
3186 return (Py_UCS4)-1;
3187 }
3188 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3189 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003190 return (Py_UCS4)-1;
3191 }
3192 return PyUnicode_READ_CHAR(unicode, index);
3193}
3194
3195int
3196PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3197{
3198 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003199 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003200 return -1;
3201 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02003202 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3203 PyErr_SetString(PyExc_IndexError, "string index out of range");
3204 return -1;
3205 }
3206 if (_PyUnicode_Dirty(unicode))
3207 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003208 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3209 index, ch);
3210 return 0;
3211}
3212
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be modified. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3218
Victor Stinner554f3f02010-06-16 23:33:54 +00003219/* create or adjust a UnicodeDecodeError */
3220static void
3221make_decode_exception(PyObject **exceptionObject,
3222 const char *encoding,
3223 const char *input, Py_ssize_t length,
3224 Py_ssize_t startpos, Py_ssize_t endpos,
3225 const char *reason)
3226{
3227 if (*exceptionObject == NULL) {
3228 *exceptionObject = PyUnicodeDecodeError_Create(
3229 encoding, input, length, startpos, endpos, reason);
3230 }
3231 else {
3232 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3233 goto onError;
3234 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3235 goto onError;
3236 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3237 goto onError;
3238 }
3239 return;
3240
3241onError:
3242 Py_DECREF(*exceptionObject);
3243 *exceptionObject = NULL;
3244}
3245
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003246/* error handling callback helper:
3247 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003248 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003249 and adjust various state variables.
3250 return 0 on success, -1 on error
3251*/
3252
Alexander Belopolsky40018472011-02-26 01:02:56 +00003253static int
3254unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003255 const char *encoding, const char *reason,
3256 const char **input, const char **inend, Py_ssize_t *startinpos,
3257 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
3258 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003259{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003260 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003261
3262 PyObject *restuple = NULL;
3263 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003264 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003265 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003266 Py_ssize_t requiredsize;
3267 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003268 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003269 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003270 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003271 int res = -1;
3272
3273 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003274 *errorHandler = PyCodec_LookupError(errors);
3275 if (*errorHandler == NULL)
3276 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003277 }
3278
Victor Stinner554f3f02010-06-16 23:33:54 +00003279 make_decode_exception(exceptionObject,
3280 encoding,
3281 *input, *inend - *input,
3282 *startinpos, *endinpos,
3283 reason);
3284 if (*exceptionObject == NULL)
3285 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003286
3287 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3288 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003289 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003290 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003291 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003292 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003293 }
3294 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003295 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003296
3297 /* Copy back the bytes variables, which might have been modified by the
3298 callback */
3299 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3300 if (!inputobj)
3301 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003302 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003303 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003304 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003305 *input = PyBytes_AS_STRING(inputobj);
3306 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003307 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003308 /* we can DECREF safely, as the exception has another reference,
3309 so the object won't go away. */
3310 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003311
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003312 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003313 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003314 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003315 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3316 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003317 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003318
3319 /* need more space? (at least enough for what we
3320 have+the replacement+the rest of the string (starting
3321 at the new input position), so we won't have to check space
3322 when there are no errors in the rest of the string) */
3323 repptr = PyUnicode_AS_UNICODE(repunicode);
3324 repsize = PyUnicode_GET_SIZE(repunicode);
3325 requiredsize = *outpos + repsize + insize-newpos;
3326 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003327 if (requiredsize<2*outsize)
3328 requiredsize = 2*outsize;
Victor Stinnerfe226c02011-10-03 03:52:20 +02003329 if (PyUnicode_Resize((PyObject**)output, requiredsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003330 goto onError;
3331 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003332 }
3333 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003334 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003335 Py_UNICODE_COPY(*outptr, repptr, repsize);
3336 *outptr += repsize;
3337 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003338
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003339 /* we made it! */
3340 res = 0;
3341
Benjamin Peterson29060642009-01-31 22:14:21 +00003342 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003343 Py_XDECREF(restuple);
3344 return res;
3345}
3346
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

/* The table is only ever read, hence const. */
static const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * directO and directWS are parenthesized so that an argument that is
 * itself an expression (e.g. "a || b") keeps its meaning inside the
 * expansion. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003427
Alexander Belopolsky40018472011-02-26 01:02:56 +00003428PyObject *
3429PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003430 Py_ssize_t size,
3431 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003432{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003433 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3434}
3435
Antoine Pitrou244651a2009-05-04 18:56:13 +00003436/* The decoder. The only state we preserve is our read position,
3437 * i.e. how many characters we have consumed. So if we end in the
3438 * middle of a shift sequence we have to back off the read position
3439 * and the output to the beginning of the sequence, otherwise we lose
3440 * all the shift state (seen bits, number of bits seen, high
3441 * surrogate). */
3442
Alexander Belopolsky40018472011-02-26 01:02:56 +00003443PyObject *
3444PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003445 Py_ssize_t size,
3446 const char *errors,
3447 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003448{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003449 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003450 Py_ssize_t startinpos;
3451 Py_ssize_t endinpos;
3452 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003453 const char *e;
3454 PyUnicodeObject *unicode;
3455 Py_UNICODE *p;
3456 const char *errmsg = "";
3457 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003458 Py_UNICODE *shiftOutStart;
3459 unsigned int base64bits = 0;
3460 unsigned long base64buffer = 0;
3461 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003462 PyObject *errorHandler = NULL;
3463 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003464
3465 unicode = _PyUnicode_New(size);
3466 if (!unicode)
3467 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003468 if (size == 0) {
3469 if (consumed)
3470 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003471 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003472 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003473
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003474 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003475 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003476 e = s + size;
3477
3478 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003479 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003480 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003481 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003482
Antoine Pitrou244651a2009-05-04 18:56:13 +00003483 if (inShift) { /* in a base-64 section */
3484 if (IS_BASE64(ch)) { /* consume a base-64 character */
3485 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3486 base64bits += 6;
3487 s++;
3488 if (base64bits >= 16) {
3489 /* we have enough bits for a UTF-16 value */
3490 Py_UNICODE outCh = (Py_UNICODE)
3491 (base64buffer >> (base64bits-16));
3492 base64bits -= 16;
3493 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3494 if (surrogate) {
3495 /* expecting a second surrogate */
3496 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3497#ifdef Py_UNICODE_WIDE
3498 *p++ = (((surrogate & 0x3FF)<<10)
3499 | (outCh & 0x3FF)) + 0x10000;
3500#else
3501 *p++ = surrogate;
3502 *p++ = outCh;
3503#endif
3504 surrogate = 0;
3505 }
3506 else {
3507 surrogate = 0;
3508 errmsg = "second surrogate missing";
3509 goto utf7Error;
3510 }
3511 }
3512 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3513 /* first surrogate */
3514 surrogate = outCh;
3515 }
3516 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3517 errmsg = "unexpected second surrogate";
3518 goto utf7Error;
3519 }
3520 else {
3521 *p++ = outCh;
3522 }
3523 }
3524 }
3525 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003526 inShift = 0;
3527 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003528 if (surrogate) {
3529 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003530 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003531 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003532 if (base64bits > 0) { /* left-over bits */
3533 if (base64bits >= 6) {
3534 /* We've seen at least one base-64 character */
3535 errmsg = "partial character in shift sequence";
3536 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003537 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003538 else {
3539 /* Some bits remain; they should be zero */
3540 if (base64buffer != 0) {
3541 errmsg = "non-zero padding bits in shift sequence";
3542 goto utf7Error;
3543 }
3544 }
3545 }
3546 if (ch != '-') {
3547 /* '-' is absorbed; other terminating
3548 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003549 *p++ = ch;
3550 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003551 }
3552 }
3553 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003554 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003555 s++; /* consume '+' */
3556 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003557 s++;
3558 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003559 }
3560 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003561 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003562 shiftOutStart = p;
3563 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003564 }
3565 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003566 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003567 *p++ = ch;
3568 s++;
3569 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003570 else {
3571 startinpos = s-starts;
3572 s++;
3573 errmsg = "unexpected special character";
3574 goto utf7Error;
3575 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003576 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003577utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003578 outpos = p-PyUnicode_AS_UNICODE(unicode);
3579 endinpos = s-starts;
3580 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003581 errors, &errorHandler,
3582 "utf7", errmsg,
3583 &starts, &e, &startinpos, &endinpos, &exc, &s,
3584 &unicode, &outpos, &p))
3585 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003586 }
3587
Antoine Pitrou244651a2009-05-04 18:56:13 +00003588 /* end of string */
3589
3590 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3591 /* if we're in an inconsistent state, that's an error */
3592 if (surrogate ||
3593 (base64bits >= 6) ||
3594 (base64bits > 0 && base64buffer != 0)) {
3595 outpos = p-PyUnicode_AS_UNICODE(unicode);
3596 endinpos = size;
3597 if (unicode_decode_call_errorhandler(
3598 errors, &errorHandler,
3599 "utf7", "unterminated shift sequence",
3600 &starts, &e, &startinpos, &endinpos, &exc, &s,
3601 &unicode, &outpos, &p))
3602 goto onError;
3603 if (s < e)
3604 goto restart;
3605 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003606 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003607
3608 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003609 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003610 if (inShift) {
3611 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003612 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003613 }
3614 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003615 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003616 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003617 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003618
Victor Stinnerfe226c02011-10-03 03:52:20 +02003619 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003620 goto onError;
3621
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003622 Py_XDECREF(errorHandler);
3623 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02003624 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003625 Py_DECREF(unicode);
3626 return NULL;
3627 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003628 return (PyObject *)unicode;
3629
Benjamin Peterson29060642009-01-31 22:14:21 +00003630 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003631 Py_XDECREF(errorHandler);
3632 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003633 Py_DECREF(unicode);
3634 return NULL;
3635}
3636
3637
Alexander Belopolsky40018472011-02-26 01:02:56 +00003638PyObject *
3639PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003640 Py_ssize_t size,
3641 int base64SetO,
3642 int base64WhiteSpace,
3643 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003644{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003645 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003646 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003647 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003648 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003649 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003650 unsigned int base64bits = 0;
3651 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003652 char * out;
3653 char * start;
3654
3655 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003656 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003657
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003658 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003659 return PyErr_NoMemory();
3660
Antoine Pitrou244651a2009-05-04 18:56:13 +00003661 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003662 if (v == NULL)
3663 return NULL;
3664
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003665 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003666 for (;i < size; ++i) {
3667 Py_UNICODE ch = s[i];
3668
Antoine Pitrou244651a2009-05-04 18:56:13 +00003669 if (inShift) {
3670 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3671 /* shifting out */
3672 if (base64bits) { /* output remaining bits */
3673 *out++ = TO_BASE64(base64buffer << (6-base64bits));
3674 base64buffer = 0;
3675 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003676 }
3677 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003678 /* Characters not in the BASE64 set implicitly unshift the sequence
3679 so no '-' is required, except if the character is itself a '-' */
3680 if (IS_BASE64(ch) || ch == '-') {
3681 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003682 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003683 *out++ = (char) ch;
3684 }
3685 else {
3686 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00003687 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003688 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003689 else { /* not in a shift sequence */
3690 if (ch == '+') {
3691 *out++ = '+';
3692 *out++ = '-';
3693 }
3694 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3695 *out++ = (char) ch;
3696 }
3697 else {
3698 *out++ = '+';
3699 inShift = 1;
3700 goto encode_char;
3701 }
3702 }
3703 continue;
3704encode_char:
3705#ifdef Py_UNICODE_WIDE
3706 if (ch >= 0x10000) {
3707 /* code first surrogate */
3708 base64bits += 16;
3709 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
3710 while (base64bits >= 6) {
3711 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3712 base64bits -= 6;
3713 }
3714 /* prepare second surrogate */
3715 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
3716 }
3717#endif
3718 base64bits += 16;
3719 base64buffer = (base64buffer << 16) | ch;
3720 while (base64bits >= 6) {
3721 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3722 base64bits -= 6;
3723 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00003724 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003725 if (base64bits)
3726 *out++= TO_BASE64(base64buffer << (6-base64bits) );
3727 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003728 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003729 if (_PyBytes_Resize(&v, out - start) < 0)
3730 return NULL;
3731 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003732}
3733
Antoine Pitrou244651a2009-05-04 18:56:13 +00003734#undef IS_BASE64
3735#undef FROM_BASE64
3736#undef TO_BASE64
3737#undef DECODE_DIRECT
3738#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003739
Guido van Rossumd57fd912000-03-10 22:53:23 +00003740/* --- UTF-8 Codec -------------------------------------------------------- */
3741
/* Map a UTF-8 lead (prefix) byte to the total length in bytes of the
   sequence it introduces.  Zero means the byte is not a legal prefix:
   continuation bytes (80-BF), the overlong prefixes C0-C1, and F5-FF.
   See RFC 3629 for details.

   The table is only ever read, so it is declared const. */
static
const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
3763
Alexander Belopolsky40018472011-02-26 01:02:56 +00003764PyObject *
3765PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003766 Py_ssize_t size,
3767 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003768{
Walter Dörwald69652032004-09-07 20:24:22 +00003769 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3770}
3771
Antoine Pitrouab868312009-01-10 15:40:25 +00003772/* Mask to check or force alignment of a pointer to C 'long' boundaries */
3773#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
3774
3775/* Mask to quickly check whether a C 'long' contains a
3776 non-ASCII, UTF8-encoded char. */
3777#if (SIZEOF_LONG == 8)
3778# define ASCII_CHAR_MASK 0x8080808080808080L
3779#elif (SIZEOF_LONG == 4)
3780# define ASCII_CHAR_MASK 0x80808080L
3781#else
3782# error C 'long' size should be either 4 or 8!
3783#endif
3784
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003785/* Scans a UTF-8 string and returns the maximum character to be expected,
3786 the size of the decoded unicode string and if any major errors were
3787 encountered.
3788
3789 This function does check basic UTF-8 sanity, it does however NOT CHECK
3790 if the string contains surrogates, and if all continuation bytes are
3791 within the correct ranges, these checks are performed in
3792 PyUnicode_DecodeUTF8Stateful.
3793
3794 If it sets has_errors to 1, it means the value of unicode_size and max_char
3795 will be bogus and you should not rely on useful information in them.
3796 */
3797static Py_UCS4
3798utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
3799 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
3800 int *has_errors)
3801{
3802 Py_ssize_t n;
3803 Py_ssize_t char_count = 0;
3804 Py_UCS4 max_char = 127, new_max;
3805 Py_UCS4 upper_bound;
3806 const unsigned char *p = (const unsigned char *)s;
3807 const unsigned char *end = p + string_size;
3808 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
3809 int err = 0;
3810
3811 for (; p < end && !err; ++p, ++char_count) {
3812 /* Only check value if it's not a ASCII char... */
3813 if (*p < 0x80) {
3814 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
3815 an explanation. */
3816 if (!((size_t) p & LONG_PTR_MASK)) {
3817 /* Help register allocation */
3818 register const unsigned char *_p = p;
3819 while (_p < aligned_end) {
3820 unsigned long value = *(unsigned long *) _p;
3821 if (value & ASCII_CHAR_MASK)
3822 break;
3823 _p += SIZEOF_LONG;
3824 char_count += SIZEOF_LONG;
3825 }
3826 p = _p;
3827 if (p == end)
3828 break;
3829 }
3830 }
3831 if (*p >= 0x80) {
3832 n = utf8_code_length[*p];
3833 new_max = max_char;
3834 switch (n) {
3835 /* invalid start byte */
3836 case 0:
3837 err = 1;
3838 break;
3839 case 2:
3840 /* Code points between 0x00FF and 0x07FF inclusive.
3841 Approximate the upper bound of the code point,
3842 if this flips over 255 we can be sure it will be more
3843 than 255 and the string will need 2 bytes per code coint,
3844 if it stays under or equal to 255, we can be sure 1 byte
3845 is enough.
3846 ((*p & 0b00011111) << 6) | 0b00111111 */
3847 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
3848 if (max_char < upper_bound)
3849 new_max = upper_bound;
3850 /* Ensure we track at least that we left ASCII space. */
3851 if (new_max < 128)
3852 new_max = 128;
3853 break;
3854 case 3:
3855 /* Between 0x0FFF and 0xFFFF inclusive, so values are
3856 always > 255 and <= 65535 and will always need 2 bytes. */
3857 if (max_char < 65535)
3858 new_max = 65535;
3859 break;
3860 case 4:
3861 /* Code point will be above 0xFFFF for sure in this case. */
3862 new_max = 65537;
3863 break;
3864 /* Internal error, this should be caught by the first if */
3865 case 1:
3866 default:
3867 assert(0 && "Impossible case in utf8_max_char_and_size");
3868 err = 1;
3869 }
3870 /* Instead of number of overall bytes for this code point,
3871 n containts the number of following bytes: */
3872 --n;
3873 /* Check if the follow up chars are all valid continuation bytes */
3874 if (n >= 1) {
3875 const unsigned char *cont;
3876 if ((p + n) >= end) {
3877 if (consumed == 0)
3878 /* incomplete data, non-incremental decoding */
3879 err = 1;
3880 break;
3881 }
3882 for (cont = p + 1; cont < (p + n); ++cont) {
3883 if ((*cont & 0xc0) != 0x80) {
3884 err = 1;
3885 break;
3886 }
3887 }
3888 p += n;
3889 }
3890 else
3891 err = 1;
3892 max_char = new_max;
3893 }
3894 }
3895
3896 if (unicode_size)
3897 *unicode_size = char_count;
3898 if (has_errors)
3899 *has_errors = err;
3900 return max_char;
3901}
3902
/* Store `value` at position `index` of `buf`, choosing the element type
   from `kind`.  Similar to PyUnicode_WRITE, but PyUnicode_WCHAR_KIND is
   accepted as well, in which case `buf` is treated as the Py_UNICODE (wstr)
   buffer of the legacy unicode representation.

   `kind` is evaluated once up front; exactly one branch runs, so `buf`,
   `index` and `value` are each evaluated once, too (callers pass `i++`). */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value)                     \
    do {                                                                    \
        const int kind_ = (kind);                                           \
        if (kind_ == PyUnicode_WCHAR_KIND) {                                \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);           \
        }                                                                   \
        else if (kind_ == PyUnicode_1BYTE_KIND) {                           \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value);     \
        }                                                                   \
        else if (kind_ == PyUnicode_2BYTE_KIND) {                           \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value);                 \
        }                                                                   \
        else {                                                              \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value);                 \
        }                                                                   \
    } while (0)
3917
Alexander Belopolsky40018472011-02-26 01:02:56 +00003918PyObject *
3919PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003920 Py_ssize_t size,
3921 const char *errors,
3922 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003923{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003924 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003925 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00003926 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003927 Py_ssize_t startinpos;
3928 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00003929 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003930 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003931 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003932 PyObject *errorHandler = NULL;
3933 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003934 Py_UCS4 maxchar = 0;
3935 Py_ssize_t unicode_size;
3936 Py_ssize_t i;
3937 int kind;
3938 void *data;
3939 int has_errors;
3940 Py_UNICODE *error_outptr;
3941#if SIZEOF_WCHAR_T == 2
3942 Py_ssize_t wchar_offset = 0;
3943#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003944
Walter Dörwald69652032004-09-07 20:24:22 +00003945 if (size == 0) {
3946 if (consumed)
3947 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003948 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00003949 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003950 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
3951 consumed, &has_errors);
3952 if (has_errors) {
3953 unicode = _PyUnicode_New(size);
3954 if (!unicode)
3955 return NULL;
3956 kind = PyUnicode_WCHAR_KIND;
3957 data = PyUnicode_AS_UNICODE(unicode);
3958 assert(data != NULL);
3959 }
3960 else {
3961 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
3962 if (!unicode)
3963 return NULL;
3964 /* When the string is ASCII only, just use memcpy and return.
3965 unicode_size may be != size if there is an incomplete UTF-8
3966 sequence at the end of the ASCII block. */
3967 if (maxchar < 128 && size == unicode_size) {
3968 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
3969 return (PyObject *)unicode;
3970 }
3971 kind = PyUnicode_KIND(unicode);
3972 data = PyUnicode_DATA(unicode);
3973 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003974 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003975 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003976 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00003977 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003978
3979 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003980 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003981
3982 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00003983 /* Fast path for runs of ASCII characters. Given that common UTF-8
3984 input will consist of an overwhelming majority of ASCII
3985 characters, we try to optimize for this case by checking
3986 as many characters as a C 'long' can contain.
3987 First, check if we can do an aligned read, as most CPUs have
3988 a penalty for unaligned reads.
3989 */
3990 if (!((size_t) s & LONG_PTR_MASK)) {
3991 /* Help register allocation */
3992 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003993 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003994 while (_s < aligned_end) {
3995 /* Read a whole long at a time (either 4 or 8 bytes),
3996 and do a fast unrolled copy if it only contains ASCII
3997 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003998 unsigned long value = *(unsigned long *) _s;
3999 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004000 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004001 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
4002 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
4003 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
4004 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004005#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004006 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
4007 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
4008 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
4009 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004010#endif
4011 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004012 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004013 }
4014 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004015 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004016 if (s == e)
4017 break;
4018 ch = (unsigned char)*s;
4019 }
4020 }
4021
4022 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004023 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004024 s++;
4025 continue;
4026 }
4027
4028 n = utf8_code_length[ch];
4029
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004030 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004031 if (consumed)
4032 break;
4033 else {
4034 errmsg = "unexpected end of data";
4035 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004036 endinpos = startinpos+1;
4037 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4038 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004039 goto utf8Error;
4040 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004041 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004042
4043 switch (n) {
4044
4045 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004046 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004047 startinpos = s-starts;
4048 endinpos = startinpos+1;
4049 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004050
4051 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004052 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004053 startinpos = s-starts;
4054 endinpos = startinpos+1;
4055 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004056
4057 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004058 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004059 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004060 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004061 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004062 goto utf8Error;
4063 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004064 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004065 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004066 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004067 break;
4068
4069 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004070 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4071 will result in surrogates in range d800-dfff. Surrogates are
4072 not valid UTF-8 so they are rejected.
4073 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4074 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004075 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004076 (s[2] & 0xc0) != 0x80 ||
4077 ((unsigned char)s[0] == 0xE0 &&
4078 (unsigned char)s[1] < 0xA0) ||
4079 ((unsigned char)s[0] == 0xED &&
4080 (unsigned char)s[1] > 0x9F)) {
4081 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004082 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004083 endinpos = startinpos + 1;
4084
4085 /* if s[1] first two bits are 1 and 0, then the invalid
4086 continuation byte is s[2], so increment endinpos by 1,
4087 if not, s[1] is invalid and endinpos doesn't need to
4088 be incremented. */
4089 if ((s[1] & 0xC0) == 0x80)
4090 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004091 goto utf8Error;
4092 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004093 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004094 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004095 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004096 break;
4097
4098 case 4:
4099 if ((s[1] & 0xc0) != 0x80 ||
4100 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004101 (s[3] & 0xc0) != 0x80 ||
4102 ((unsigned char)s[0] == 0xF0 &&
4103 (unsigned char)s[1] < 0x90) ||
4104 ((unsigned char)s[0] == 0xF4 &&
4105 (unsigned char)s[1] > 0x8F)) {
4106 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004107 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004108 endinpos = startinpos + 1;
4109 if ((s[1] & 0xC0) == 0x80) {
4110 endinpos++;
4111 if ((s[2] & 0xC0) == 0x80)
4112 endinpos++;
4113 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004114 goto utf8Error;
4115 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004116 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004117 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4118 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4119
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004120 /* If the string is flexible or we have native UCS-4, write
4121 directly.. */
4122 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
4123 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00004124
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004125 else {
4126 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00004127
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004128 /* translate from 10000..10FFFF to 0..FFFF */
4129 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00004130
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004131 /* high surrogate = top 10 bits added to D800 */
4132 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4133 (Py_UNICODE)(0xD800 + (ch >> 10)));
4134
4135 /* low surrogate = bottom 10 bits added to DC00 */
4136 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4137 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
4138 }
4139#if SIZEOF_WCHAR_T == 2
4140 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004141#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004142 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004143 }
4144 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004145 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004146
Benjamin Peterson29060642009-01-31 22:14:21 +00004147 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004148 /* If this is not yet a resizable string, make it one.. */
4149 if (kind != PyUnicode_WCHAR_KIND) {
4150 const Py_UNICODE *u;
4151 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
4152 if (!new_unicode)
4153 goto onError;
4154 u = PyUnicode_AsUnicode((PyObject *)unicode);
4155 if (!u)
4156 goto onError;
4157#if SIZEOF_WCHAR_T == 2
4158 i += wchar_offset;
4159#endif
4160 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
4161 Py_DECREF(unicode);
4162 unicode = new_unicode;
4163 kind = 0;
4164 data = PyUnicode_AS_UNICODE(new_unicode);
4165 assert(data != NULL);
4166 }
4167 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00004168 if (unicode_decode_call_errorhandler(
4169 errors, &errorHandler,
4170 "utf8", errmsg,
4171 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004172 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00004173 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004174 /* Update data because unicode_decode_call_errorhandler might have
4175 re-created or resized the unicode object. */
4176 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00004177 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004178 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004179 /* Ensure the unicode_size calculation above was correct: */
4180 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
4181
Walter Dörwald69652032004-09-07 20:24:22 +00004182 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004183 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004184
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004185 /* Adjust length and ready string when it contained errors and
4186 is of the old resizable kind. */
4187 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004188 if (PyUnicode_Resize((PyObject**)&unicode, i) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004189 goto onError;
4190 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004191
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004192 Py_XDECREF(errorHandler);
4193 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004194 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004195 Py_DECREF(unicode);
4196 return NULL;
4197 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004198 return (PyObject *)unicode;
4199
Benjamin Peterson29060642009-01-31 22:14:21 +00004200 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004201 Py_XDECREF(errorHandler);
4202 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004203 Py_DECREF(unicode);
4204 return NULL;
4205}
4206
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004207#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00004208
#ifdef __APPLE__

/* Stripped-down UTF-8 decoder with a built-in surrogateescape policy,
   used to decode the command line arguments on Mac OS X.

   Every byte that does not start a well-formed sequence (illegal lead
   byte, truncated sequence, bad continuation byte, overlong or surrogate
   encoding, value above 10FFFF) is emitted on its own as 0xDC00 + byte and
   decoding resumes at the following byte.

   Returns a L'\0'-terminated buffer obtained from PyMem_Malloc, or NULL if
   the buffer could not be allocated. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *end;
    wchar_t *buf, *out;

    /* One wchar_t per input byte plus the terminator is always enough:
       no sequence produces more code units than it has bytes. */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    buf = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (!buf)
        return NULL;

    /* Unpack UTF-8 encoded data */
    out = buf;
    for (end = s + size; s < end; ) {
        /* ch holds the lead byte until a sequence has been validated, so
           the escape branch at the bottom can still see the raw byte. */
        Py_UCS4 ch = (unsigned char)*s;
        int seq_len;
        int decoded = 0;

        if (ch < 0x80) {
            *out++ = (wchar_t)ch;
            s++;
            continue;
        }

        seq_len = utf8_code_length[ch];

        /* Lengths 0 and 1 (illegal prefix) match none of the branches
           below and therefore end up being escaped. */
        if (!(s + seq_len > end)) {
            if (seq_len == 2) {
                if ((s[1] & 0xc0) == 0x80) {
                    ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
                    assert ((ch > 0x007F) && (ch <= 0x07FF));
                    *out++ = (wchar_t)ch;
                    decoded = 1;
                }
            }
            else if (seq_len == 3) {
                /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
                   would yield surrogates in range d800-dfff, which are not
                   valid UTF-8; \xe0 followed by a byte below \xa0 is an
                   overlong form.  Both are rejected.
                   See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
                   (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
                if ((s[1] & 0xc0) == 0x80 &&
                    (s[2] & 0xc0) == 0x80 &&
                    !((unsigned char)s[0] == 0xE0 &&
                      (unsigned char)s[1] < 0xA0) &&
                    !((unsigned char)s[0] == 0xED &&
                      (unsigned char)s[1] > 0x9F)) {
                    ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6)
                         + (s[2] & 0x3f);
                    assert ((ch > 0x07FF) && (ch <= 0xFFFF));
                    *out++ = (wchar_t)ch;
                    decoded = 1;
                }
            }
            else if (seq_len == 4) {
                if ((s[1] & 0xc0) == 0x80 &&
                    (s[2] & 0xc0) == 0x80 &&
                    (s[3] & 0xc0) == 0x80 &&
                    !((unsigned char)s[0] == 0xF0 &&
                      (unsigned char)s[1] < 0x90) &&
                    !((unsigned char)s[0] == 0xF4 &&
                      (unsigned char)s[1] > 0x8F)) {
                    ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                         ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
                    assert ((ch > 0xFFFF) && (ch <= 0x10ffff));

#if SIZEOF_WCHAR_T == 4
                    *out++ = (wchar_t)ch;
#else
                    /* wchar_t is too narrow: emit a surrogate pair. */

                    /* translate from 10000..10FFFF to 0..FFFF */
                    ch -= 0x10000;

                    /* high surrogate = top 10 bits added to D800 */
                    *out++ = (wchar_t)(0xD800 + (ch >> 10));

                    /* low surrogate = bottom 10 bits added to DC00 */
                    *out++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
#endif
                    decoded = 1;
                }
            }
        }

        if (decoded) {
            s += seq_len;
        }
        else {
            /* surrogateescape: pass the offending byte through */
            *out++ = 0xDC00 + ch;
            s++;
        }
    }
    *out = L'\0';
    return buf;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004323
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004324/* Primary internal function which creates utf8 encoded bytes objects.
4325
4326 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004327 and allocate exactly as much space needed at the end. Else allocate the
4328 maximum possible needed (4 result bytes per Unicode character), and return
4329 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004330*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004331PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004332_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004333{
Tim Peters602f7402002-04-27 18:03:26 +00004334#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004335
Guido van Rossum98297ee2007-11-06 21:34:58 +00004336 Py_ssize_t i; /* index into s of next input byte */
4337 PyObject *result; /* result string object */
4338 char *p; /* next free byte in output buffer */
4339 Py_ssize_t nallocated; /* number of result bytes allocated */
4340 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004341 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004342 PyObject *errorHandler = NULL;
4343 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004344 int kind;
4345 void *data;
4346 Py_ssize_t size;
4347 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
4348#if SIZEOF_WCHAR_T == 2
4349 Py_ssize_t wchar_offset = 0;
4350#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004351
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004352 if (!PyUnicode_Check(unicode)) {
4353 PyErr_BadArgument();
4354 return NULL;
4355 }
4356
4357 if (PyUnicode_READY(unicode) == -1)
4358 return NULL;
4359
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004360 if (PyUnicode_UTF8(unicode))
4361 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4362 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004363
4364 kind = PyUnicode_KIND(unicode);
4365 data = PyUnicode_DATA(unicode);
4366 size = PyUnicode_GET_LENGTH(unicode);
4367
Tim Peters602f7402002-04-27 18:03:26 +00004368 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004369
Tim Peters602f7402002-04-27 18:03:26 +00004370 if (size <= MAX_SHORT_UNICHARS) {
4371 /* Write into the stack buffer; nallocated can't overflow.
4372 * At the end, we'll allocate exactly as much heap space as it
4373 * turns out we need.
4374 */
4375 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004376 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004377 p = stackbuf;
4378 }
4379 else {
4380 /* Overallocate on the heap, and give the excess back at the end. */
4381 nallocated = size * 4;
4382 if (nallocated / 4 != size) /* overflow! */
4383 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004384 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004385 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004386 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004387 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004388 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004389
Tim Peters602f7402002-04-27 18:03:26 +00004390 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004391 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004392
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004393 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004394 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004395 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004396
Guido van Rossumd57fd912000-03-10 22:53:23 +00004397 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004398 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004399 *p++ = (char)(0xc0 | (ch >> 6));
4400 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004401 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004402 Py_ssize_t newpos;
4403 PyObject *rep;
4404 Py_ssize_t repsize, k, startpos;
4405 startpos = i-1;
4406#if SIZEOF_WCHAR_T == 2
4407 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00004408#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004409 rep = unicode_encode_call_errorhandler(
4410 errors, &errorHandler, "utf-8", "surrogates not allowed",
4411 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4412 &exc, startpos, startpos+1, &newpos);
4413 if (!rep)
4414 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004415
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004416 if (PyBytes_Check(rep))
4417 repsize = PyBytes_GET_SIZE(rep);
4418 else
4419 repsize = PyUnicode_GET_SIZE(rep);
4420
4421 if (repsize > 4) {
4422 Py_ssize_t offset;
4423
4424 if (result == NULL)
4425 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004426 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004427 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004428
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004429 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4430 /* integer overflow */
4431 PyErr_NoMemory();
4432 goto error;
4433 }
4434 nallocated += repsize - 4;
4435 if (result != NULL) {
4436 if (_PyBytes_Resize(&result, nallocated) < 0)
4437 goto error;
4438 } else {
4439 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004440 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004441 goto error;
4442 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4443 }
4444 p = PyBytes_AS_STRING(result) + offset;
4445 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004446
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004447 if (PyBytes_Check(rep)) {
4448 char *prep = PyBytes_AS_STRING(rep);
4449 for(k = repsize; k > 0; k--)
4450 *p++ = *prep++;
4451 } else /* rep is unicode */ {
4452 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4453 Py_UNICODE c;
4454
4455 for(k=0; k<repsize; k++) {
4456 c = prep[k];
4457 if (0x80 <= c) {
4458 raise_encode_exception(&exc, "utf-8",
4459 PyUnicode_AS_UNICODE(unicode),
4460 size, i-1, i,
4461 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004462 goto error;
4463 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004464 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004465 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004466 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004467 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004468 } else if (ch < 0x10000) {
4469 *p++ = (char)(0xe0 | (ch >> 12));
4470 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4471 *p++ = (char)(0x80 | (ch & 0x3f));
4472 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004473 /* Encode UCS4 Unicode ordinals */
4474 *p++ = (char)(0xf0 | (ch >> 18));
4475 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4476 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4477 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004478#if SIZEOF_WCHAR_T == 2
4479 wchar_offset++;
4480#endif
Tim Peters602f7402002-04-27 18:03:26 +00004481 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004482 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004483
Guido van Rossum98297ee2007-11-06 21:34:58 +00004484 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004485 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004486 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004487 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004488 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004489 }
4490 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004491 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004492 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004493 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004494 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004495 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004496
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004497 Py_XDECREF(errorHandler);
4498 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004499 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004500 error:
4501 Py_XDECREF(errorHandler);
4502 Py_XDECREF(exc);
4503 Py_XDECREF(result);
4504 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004505
Tim Peters602f7402002-04-27 18:03:26 +00004506#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004507}
4508
Alexander Belopolsky40018472011-02-26 01:02:56 +00004509PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004510PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4511 Py_ssize_t size,
4512 const char *errors)
4513{
4514 PyObject *v, *unicode;
4515
4516 unicode = PyUnicode_FromUnicode(s, size);
4517 if (unicode == NULL)
4518 return NULL;
4519 v = _PyUnicode_AsUTF8String(unicode, errors);
4520 Py_DECREF(unicode);
4521 return v;
4522}
4523
4524PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004525PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004526{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004527 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004528}
4529
Walter Dörwald41980ca2007-08-16 21:55:45 +00004530/* --- UTF-32 Codec ------------------------------------------------------- */
4531
4532PyObject *
4533PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004534 Py_ssize_t size,
4535 const char *errors,
4536 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004537{
4538 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4539}
4540
4541PyObject *
4542PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004543 Py_ssize_t size,
4544 const char *errors,
4545 int *byteorder,
4546 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004547{
4548 const char *starts = s;
4549 Py_ssize_t startinpos;
4550 Py_ssize_t endinpos;
4551 Py_ssize_t outpos;
4552 PyUnicodeObject *unicode;
4553 Py_UNICODE *p;
4554#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004555 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004556 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004557#else
4558 const int pairs = 0;
4559#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004560 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004561 int bo = 0; /* assume native ordering by default */
4562 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004563 /* Offsets from q for retrieving bytes in the right order. */
4564#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4565 int iorder[] = {0, 1, 2, 3};
4566#else
4567 int iorder[] = {3, 2, 1, 0};
4568#endif
4569 PyObject *errorHandler = NULL;
4570 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004571
Walter Dörwald41980ca2007-08-16 21:55:45 +00004572 q = (unsigned char *)s;
4573 e = q + size;
4574
4575 if (byteorder)
4576 bo = *byteorder;
4577
4578 /* Check for BOM marks (U+FEFF) in the input and adjust current
4579 byte order setting accordingly. In native mode, the leading BOM
4580 mark is skipped, in all other modes, it is copied to the output
4581 stream as-is (giving a ZWNBSP character). */
4582 if (bo == 0) {
4583 if (size >= 4) {
4584 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004585 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004586#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004587 if (bom == 0x0000FEFF) {
4588 q += 4;
4589 bo = -1;
4590 }
4591 else if (bom == 0xFFFE0000) {
4592 q += 4;
4593 bo = 1;
4594 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004595#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004596 if (bom == 0x0000FEFF) {
4597 q += 4;
4598 bo = 1;
4599 }
4600 else if (bom == 0xFFFE0000) {
4601 q += 4;
4602 bo = -1;
4603 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004604#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004605 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004606 }
4607
4608 if (bo == -1) {
4609 /* force LE */
4610 iorder[0] = 0;
4611 iorder[1] = 1;
4612 iorder[2] = 2;
4613 iorder[3] = 3;
4614 }
4615 else if (bo == 1) {
4616 /* force BE */
4617 iorder[0] = 3;
4618 iorder[1] = 2;
4619 iorder[2] = 1;
4620 iorder[3] = 0;
4621 }
4622
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004623 /* On narrow builds we split characters outside the BMP into two
4624 codepoints => count how much extra space we need. */
4625#ifndef Py_UNICODE_WIDE
4626 for (qq = q; qq < e; qq += 4)
4627 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4628 pairs++;
4629#endif
4630
4631 /* This might be one to much, because of a BOM */
4632 unicode = _PyUnicode_New((size+3)/4+pairs);
4633 if (!unicode)
4634 return NULL;
4635 if (size == 0)
4636 return (PyObject *)unicode;
4637
4638 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004639 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004640
Walter Dörwald41980ca2007-08-16 21:55:45 +00004641 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004642 Py_UCS4 ch;
4643 /* remaining bytes at the end? (size should be divisible by 4) */
4644 if (e-q<4) {
4645 if (consumed)
4646 break;
4647 errmsg = "truncated data";
4648 startinpos = ((const char *)q)-starts;
4649 endinpos = ((const char *)e)-starts;
4650 goto utf32Error;
4651 /* The remaining input chars are ignored if the callback
4652 chooses to skip the input */
4653 }
4654 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4655 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004656
Benjamin Peterson29060642009-01-31 22:14:21 +00004657 if (ch >= 0x110000)
4658 {
4659 errmsg = "codepoint not in range(0x110000)";
4660 startinpos = ((const char *)q)-starts;
4661 endinpos = startinpos+4;
4662 goto utf32Error;
4663 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004664#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004665 if (ch >= 0x10000)
4666 {
4667 *p++ = 0xD800 | ((ch-0x10000) >> 10);
4668 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4669 }
4670 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00004671#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004672 *p++ = ch;
4673 q += 4;
4674 continue;
4675 utf32Error:
4676 outpos = p-PyUnicode_AS_UNICODE(unicode);
4677 if (unicode_decode_call_errorhandler(
4678 errors, &errorHandler,
4679 "utf32", errmsg,
4680 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4681 &unicode, &outpos, &p))
4682 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004683 }
4684
4685 if (byteorder)
4686 *byteorder = bo;
4687
4688 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004689 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004690
4691 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02004692 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004693 goto onError;
4694
4695 Py_XDECREF(errorHandler);
4696 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004697 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004698 Py_DECREF(unicode);
4699 return NULL;
4700 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004701 return (PyObject *)unicode;
4702
Benjamin Peterson29060642009-01-31 22:14:21 +00004703 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00004704 Py_DECREF(unicode);
4705 Py_XDECREF(errorHandler);
4706 Py_XDECREF(exc);
4707 return NULL;
4708}
4709
4710PyObject *
4711PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004712 Py_ssize_t size,
4713 const char *errors,
4714 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004715{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004716 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004717 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004718 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004719#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004720 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004721#else
4722 const int pairs = 0;
4723#endif
4724 /* Offsets from p for storing byte pairs in the right order. */
4725#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4726 int iorder[] = {0, 1, 2, 3};
4727#else
4728 int iorder[] = {3, 2, 1, 0};
4729#endif
4730
Benjamin Peterson29060642009-01-31 22:14:21 +00004731#define STORECHAR(CH) \
4732 do { \
4733 p[iorder[3]] = ((CH) >> 24) & 0xff; \
4734 p[iorder[2]] = ((CH) >> 16) & 0xff; \
4735 p[iorder[1]] = ((CH) >> 8) & 0xff; \
4736 p[iorder[0]] = (CH) & 0xff; \
4737 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00004738 } while(0)
4739
4740 /* In narrow builds we can output surrogate pairs as one codepoint,
4741 so we need less space. */
4742#ifndef Py_UNICODE_WIDE
4743 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004744 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
4745 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
4746 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004747#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004748 nsize = (size - pairs + (byteorder == 0));
4749 bytesize = nsize * 4;
4750 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004751 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004752 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004753 if (v == NULL)
4754 return NULL;
4755
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004756 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004757 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004758 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004759 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004760 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004761
4762 if (byteorder == -1) {
4763 /* force LE */
4764 iorder[0] = 0;
4765 iorder[1] = 1;
4766 iorder[2] = 2;
4767 iorder[3] = 3;
4768 }
4769 else if (byteorder == 1) {
4770 /* force BE */
4771 iorder[0] = 3;
4772 iorder[1] = 2;
4773 iorder[2] = 1;
4774 iorder[3] = 0;
4775 }
4776
4777 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004778 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004779#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004780 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
4781 Py_UCS4 ch2 = *s;
4782 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
4783 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
4784 s++;
4785 size--;
4786 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004787 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004788#endif
4789 STORECHAR(ch);
4790 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004791
4792 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004793 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004794#undef STORECHAR
4795}
4796
Alexander Belopolsky40018472011-02-26 01:02:56 +00004797PyObject *
4798PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004799{
4800 if (!PyUnicode_Check(unicode)) {
4801 PyErr_BadArgument();
4802 return NULL;
4803 }
4804 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004805 PyUnicode_GET_SIZE(unicode),
4806 NULL,
4807 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004808}
4809
Guido van Rossumd57fd912000-03-10 22:53:23 +00004810/* --- UTF-16 Codec ------------------------------------------------------- */
4811
Tim Peters772747b2001-08-09 22:21:55 +00004812PyObject *
4813PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004814 Py_ssize_t size,
4815 const char *errors,
4816 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004817{
Walter Dörwald69652032004-09-07 20:24:22 +00004818 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
4819}
4820
Antoine Pitrouab868312009-01-10 15:40:25 +00004821/* Two masks for fast checking of whether a C 'long' may contain
4822 UTF16-encoded surrogate characters. This is an efficient heuristic,
4823 assuming that non-surrogate characters with a code point >= 0x8000 are
4824 rare in most input.
4825 FAST_CHAR_MASK is used when the input is in native byte ordering,
4826 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00004827*/
Antoine Pitrouab868312009-01-10 15:40:25 +00004828#if (SIZEOF_LONG == 8)
4829# define FAST_CHAR_MASK 0x8000800080008000L
4830# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
4831#elif (SIZEOF_LONG == 4)
4832# define FAST_CHAR_MASK 0x80008000L
4833# define SWAPPED_FAST_CHAR_MASK 0x00800080L
4834#else
4835# error C 'long' size should be either 4 or 8!
4836#endif
4837
Walter Dörwald69652032004-09-07 20:24:22 +00004838PyObject *
4839PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004840 Py_ssize_t size,
4841 const char *errors,
4842 int *byteorder,
4843 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004844{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004845 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004846 Py_ssize_t startinpos;
4847 Py_ssize_t endinpos;
4848 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004849 PyUnicodeObject *unicode;
4850 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004851 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00004852 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00004853 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004854 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00004855 /* Offsets from q for retrieving byte pairs in the right order. */
4856#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4857 int ihi = 1, ilo = 0;
4858#else
4859 int ihi = 0, ilo = 1;
4860#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004861 PyObject *errorHandler = NULL;
4862 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004863
4864 /* Note: size will always be longer than the resulting Unicode
4865 character count */
4866 unicode = _PyUnicode_New(size);
4867 if (!unicode)
4868 return NULL;
4869 if (size == 0)
4870 return (PyObject *)unicode;
4871
4872 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004873 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00004874 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00004875 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004876
4877 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00004878 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004879
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004880 /* Check for BOM marks (U+FEFF) in the input and adjust current
4881 byte order setting accordingly. In native mode, the leading BOM
4882 mark is skipped, in all other modes, it is copied to the output
4883 stream as-is (giving a ZWNBSP character). */
4884 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00004885 if (size >= 2) {
4886 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004887#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004888 if (bom == 0xFEFF) {
4889 q += 2;
4890 bo = -1;
4891 }
4892 else if (bom == 0xFFFE) {
4893 q += 2;
4894 bo = 1;
4895 }
Tim Petersced69f82003-09-16 20:30:58 +00004896#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004897 if (bom == 0xFEFF) {
4898 q += 2;
4899 bo = 1;
4900 }
4901 else if (bom == 0xFFFE) {
4902 q += 2;
4903 bo = -1;
4904 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004905#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004906 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004907 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004908
Tim Peters772747b2001-08-09 22:21:55 +00004909 if (bo == -1) {
4910 /* force LE */
4911 ihi = 1;
4912 ilo = 0;
4913 }
4914 else if (bo == 1) {
4915 /* force BE */
4916 ihi = 0;
4917 ilo = 1;
4918 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004919#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4920 native_ordering = ilo < ihi;
4921#else
4922 native_ordering = ilo > ihi;
4923#endif
Tim Peters772747b2001-08-09 22:21:55 +00004924
Antoine Pitrouab868312009-01-10 15:40:25 +00004925 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00004926 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004927 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00004928 /* First check for possible aligned read of a C 'long'. Unaligned
4929 reads are more expensive, better to defer to another iteration. */
4930 if (!((size_t) q & LONG_PTR_MASK)) {
4931 /* Fast path for runs of non-surrogate chars. */
4932 register const unsigned char *_q = q;
4933 Py_UNICODE *_p = p;
4934 if (native_ordering) {
4935 /* Native ordering is simple: as long as the input cannot
4936 possibly contain a surrogate char, do an unrolled copy
4937 of several 16-bit code points to the target object.
4938 The non-surrogate check is done on several input bytes
4939 at a time (as many as a C 'long' can contain). */
4940 while (_q < aligned_end) {
4941 unsigned long data = * (unsigned long *) _q;
4942 if (data & FAST_CHAR_MASK)
4943 break;
4944 _p[0] = ((unsigned short *) _q)[0];
4945 _p[1] = ((unsigned short *) _q)[1];
4946#if (SIZEOF_LONG == 8)
4947 _p[2] = ((unsigned short *) _q)[2];
4948 _p[3] = ((unsigned short *) _q)[3];
4949#endif
4950 _q += SIZEOF_LONG;
4951 _p += SIZEOF_LONG / 2;
4952 }
4953 }
4954 else {
4955 /* Byteswapped ordering is similar, but we must decompose
4956 the copy bytewise, and take care of zero'ing out the
4957 upper bytes if the target object is in 32-bit units
4958 (that is, in UCS-4 builds). */
4959 while (_q < aligned_end) {
4960 unsigned long data = * (unsigned long *) _q;
4961 if (data & SWAPPED_FAST_CHAR_MASK)
4962 break;
4963 /* Zero upper bytes in UCS-4 builds */
4964#if (Py_UNICODE_SIZE > 2)
4965 _p[0] = 0;
4966 _p[1] = 0;
4967#if (SIZEOF_LONG == 8)
4968 _p[2] = 0;
4969 _p[3] = 0;
4970#endif
4971#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004972 /* Issue #4916; UCS-4 builds on big endian machines must
4973 fill the two last bytes of each 4-byte unit. */
4974#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
4975# define OFF 2
4976#else
4977# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00004978#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004979 ((unsigned char *) _p)[OFF + 1] = _q[0];
4980 ((unsigned char *) _p)[OFF + 0] = _q[1];
4981 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
4982 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
4983#if (SIZEOF_LONG == 8)
4984 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
4985 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
4986 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
4987 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
4988#endif
4989#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00004990 _q += SIZEOF_LONG;
4991 _p += SIZEOF_LONG / 2;
4992 }
4993 }
4994 p = _p;
4995 q = _q;
4996 if (q >= e)
4997 break;
4998 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004999 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005000
Benjamin Peterson14339b62009-01-31 16:36:08 +00005001 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005002
5003 if (ch < 0xD800 || ch > 0xDFFF) {
5004 *p++ = ch;
5005 continue;
5006 }
5007
5008 /* UTF-16 code pair: */
5009 if (q > e) {
5010 errmsg = "unexpected end of data";
5011 startinpos = (((const char *)q) - 2) - starts;
5012 endinpos = ((const char *)e) + 1 - starts;
5013 goto utf16Error;
5014 }
5015 if (0xD800 <= ch && ch <= 0xDBFF) {
5016 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
5017 q += 2;
5018 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00005019#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005020 *p++ = ch;
5021 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005022#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005023 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005024#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005025 continue;
5026 }
5027 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005028 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005029 startinpos = (((const char *)q)-4)-starts;
5030 endinpos = startinpos+2;
5031 goto utf16Error;
5032 }
5033
Benjamin Peterson14339b62009-01-31 16:36:08 +00005034 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005035 errmsg = "illegal encoding";
5036 startinpos = (((const char *)q)-2)-starts;
5037 endinpos = startinpos+2;
5038 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005039
Benjamin Peterson29060642009-01-31 22:14:21 +00005040 utf16Error:
5041 outpos = p - PyUnicode_AS_UNICODE(unicode);
5042 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005043 errors,
5044 &errorHandler,
5045 "utf16", errmsg,
5046 &starts,
5047 (const char **)&e,
5048 &startinpos,
5049 &endinpos,
5050 &exc,
5051 (const char **)&q,
5052 &unicode,
5053 &outpos,
5054 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00005055 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005056 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005057 /* remaining byte at the end? (size should be even) */
5058 if (e == q) {
5059 if (!consumed) {
5060 errmsg = "truncated data";
5061 startinpos = ((const char *)q) - starts;
5062 endinpos = ((const char *)e) + 1 - starts;
5063 outpos = p - PyUnicode_AS_UNICODE(unicode);
5064 if (unicode_decode_call_errorhandler(
5065 errors,
5066 &errorHandler,
5067 "utf16", errmsg,
5068 &starts,
5069 (const char **)&e,
5070 &startinpos,
5071 &endinpos,
5072 &exc,
5073 (const char **)&q,
5074 &unicode,
5075 &outpos,
5076 &p))
5077 goto onError;
5078 /* The remaining input chars are ignored if the callback
5079 chooses to skip the input */
5080 }
5081 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005082
5083 if (byteorder)
5084 *byteorder = bo;
5085
Walter Dörwald69652032004-09-07 20:24:22 +00005086 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005087 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005088
Guido van Rossumd57fd912000-03-10 22:53:23 +00005089 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02005090 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005091 goto onError;
5092
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005093 Py_XDECREF(errorHandler);
5094 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005095 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005096 Py_DECREF(unicode);
5097 return NULL;
5098 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005099 return (PyObject *)unicode;
5100
Benjamin Peterson29060642009-01-31 22:14:21 +00005101 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005102 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005103 Py_XDECREF(errorHandler);
5104 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005105 return NULL;
5106}
5107
Antoine Pitrouab868312009-01-10 15:40:25 +00005108#undef FAST_CHAR_MASK
5109#undef SWAPPED_FAST_CHAR_MASK
5110
Tim Peters772747b2001-08-09 22:21:55 +00005111PyObject *
5112PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005113 Py_ssize_t size,
5114 const char *errors,
5115 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005116{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005117 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005118 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005119 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005120#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005121 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005122#else
5123 const int pairs = 0;
5124#endif
Tim Peters772747b2001-08-09 22:21:55 +00005125 /* Offsets from p for storing byte pairs in the right order. */
5126#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5127 int ihi = 1, ilo = 0;
5128#else
5129 int ihi = 0, ilo = 1;
5130#endif
5131
Benjamin Peterson29060642009-01-31 22:14:21 +00005132#define STORECHAR(CH) \
5133 do { \
5134 p[ihi] = ((CH) >> 8) & 0xff; \
5135 p[ilo] = (CH) & 0xff; \
5136 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005137 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005138
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005139#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005140 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00005141 if (s[i] >= 0x10000)
5142 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005143#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005144 /* 2 * (size + pairs + (byteorder == 0)) */
5145 if (size > PY_SSIZE_T_MAX ||
5146 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005147 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005148 nsize = size + pairs + (byteorder == 0);
5149 bytesize = nsize * 2;
5150 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005151 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005152 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005153 if (v == NULL)
5154 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005155
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005156 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005157 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005158 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005159 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005160 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005161
5162 if (byteorder == -1) {
5163 /* force LE */
5164 ihi = 1;
5165 ilo = 0;
5166 }
5167 else if (byteorder == 1) {
5168 /* force BE */
5169 ihi = 0;
5170 ilo = 1;
5171 }
5172
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005173 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005174 Py_UNICODE ch = *s++;
5175 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005176#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005177 if (ch >= 0x10000) {
5178 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5179 ch = 0xD800 | ((ch-0x10000) >> 10);
5180 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005181#endif
Tim Peters772747b2001-08-09 22:21:55 +00005182 STORECHAR(ch);
5183 if (ch2)
5184 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005185 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005186
5187 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005188 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005189#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005190}
5191
Alexander Belopolsky40018472011-02-26 01:02:56 +00005192PyObject *
5193PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005194{
5195 if (!PyUnicode_Check(unicode)) {
5196 PyErr_BadArgument();
5197 return NULL;
5198 }
5199 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005200 PyUnicode_GET_SIZE(unicode),
5201 NULL,
5202 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005203}
5204
5205/* --- Unicode Escape Codec ----------------------------------------------- */
5206
/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
   if all the escapes in the string make it still a valid ASCII string.

   s    -- input bytes (need not be NUL-terminated; only `size` bytes are
           examined).
   size -- number of bytes at `s`.

   Returns -1 if `size` is negative, if any byte is non-ASCII, if a
   backslash is the last byte, or if an escape is found that may produce a
   non-ASCII character (octal, \x, \u, \U, \N).  Otherwise returns the
   length of the buffer required to hold the decoded string.
   */
Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Validate the size before using it in pointer arithmetic. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                /* single-character escapes decode to one character */
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* count the backslash + the other character */
                length += 2;
                break;
            }
        }
    }
    return length;
}
5261
5262/* Similar to PyUnicode_WRITE but either write into wstr field
5263 or treat string as ASCII. */
5264#define WRITE_ASCII_OR_WSTR(kind, buf, index, value) \
5265 do { \
5266 if ((kind) != PyUnicode_WCHAR_KIND) \
5267 ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
5268 else \
5269 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
5270 } while (0)
5271
5272#define WRITE_WSTR(buf, index, value) \
5273 assert(kind == PyUnicode_WCHAR_KIND), \
5274 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value)
5275
5276
Fredrik Lundh06d12682001-01-24 07:59:11 +00005277static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005278
Alexander Belopolsky40018472011-02-26 01:02:56 +00005279PyObject *
5280PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005281 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005282 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005283{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005284 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005285 Py_ssize_t startinpos;
5286 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005287 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005288 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005289 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005290 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005291 char* message;
5292 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005293 PyObject *errorHandler = NULL;
5294 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005295 Py_ssize_t ascii_length;
5296 Py_ssize_t i;
5297 int kind;
5298 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005299
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005300 ascii_length = length_of_escaped_ascii_string(s, size);
5301
5302 /* After length_of_escaped_ascii_string() there are two alternatives,
5303 either the string is pure ASCII with named escapes like \n, etc.
5304 and we determined it's exact size (common case)
5305 or it contains \x, \u, ... escape sequences. then we create a
5306 legacy wchar string and resize it at the end of this function. */
5307 if (ascii_length >= 0) {
5308 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
5309 if (!v)
5310 goto onError;
5311 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5312 kind = PyUnicode_1BYTE_KIND;
5313 data = PyUnicode_DATA(v);
5314 }
5315 else {
5316 /* Escaped strings will always be longer than the resulting
5317 Unicode string, so we start with size here and then reduce the
5318 length after conversion to the true value.
5319 (but if the error callback returns a long replacement string
5320 we'll have to allocate more space) */
5321 v = _PyUnicode_New(size);
5322 if (!v)
5323 goto onError;
5324 kind = PyUnicode_WCHAR_KIND;
5325 data = PyUnicode_AS_UNICODE(v);
5326 }
5327
Guido van Rossumd57fd912000-03-10 22:53:23 +00005328 if (size == 0)
5329 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005330 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005331 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005332
Guido van Rossumd57fd912000-03-10 22:53:23 +00005333 while (s < end) {
5334 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005335 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005336 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005337
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005338 if (kind == PyUnicode_WCHAR_KIND) {
5339 assert(i < _PyUnicode_WSTR_LENGTH(v));
5340 }
5341 else {
5342 /* The only case in which i == ascii_length is a backslash
5343 followed by a newline. */
5344 assert(i <= ascii_length);
5345 }
5346
Guido van Rossumd57fd912000-03-10 22:53:23 +00005347 /* Non-escape characters are interpreted as Unicode ordinals */
5348 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005349 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005350 continue;
5351 }
5352
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005353 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005354 /* \ - Escapes */
5355 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005356 c = *s++;
5357 if (s > end)
5358 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005359
5360 if (kind == PyUnicode_WCHAR_KIND) {
5361 assert(i < _PyUnicode_WSTR_LENGTH(v));
5362 }
5363 else {
5364 /* The only case in which i == ascii_length is a backslash
5365 followed by a newline. */
5366 assert(i < ascii_length || (i == ascii_length && c == '\n'));
5367 }
5368
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005369 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005370
Benjamin Peterson29060642009-01-31 22:14:21 +00005371 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005372 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005373 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5374 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5375 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5376 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5377 /* FF */
5378 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5379 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5380 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5381 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5382 /* VT */
5383 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5384 /* BEL, not classic C */
5385 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005386
Benjamin Peterson29060642009-01-31 22:14:21 +00005387 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005388 case '0': case '1': case '2': case '3':
5389 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005390 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005391 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005392 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005393 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005394 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005395 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005396 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005397 break;
5398
Benjamin Peterson29060642009-01-31 22:14:21 +00005399 /* hex escapes */
5400 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005401 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005402 digits = 2;
5403 message = "truncated \\xXX escape";
5404 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005405
Benjamin Peterson29060642009-01-31 22:14:21 +00005406 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005407 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005408 digits = 4;
5409 message = "truncated \\uXXXX escape";
5410 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005411
Benjamin Peterson29060642009-01-31 22:14:21 +00005412 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005413 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005414 digits = 8;
5415 message = "truncated \\UXXXXXXXX escape";
5416 hexescape:
5417 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005418 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005419 if (s+digits>end) {
5420 endinpos = size;
5421 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005422 errors, &errorHandler,
5423 "unicodeescape", "end of string in escape sequence",
5424 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005425 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005426 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005427 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005428 goto nextByte;
5429 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005430 for (j = 0; j < digits; ++j) {
5431 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005432 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005433 endinpos = (s+j+1)-starts;
5434 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005435 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005436 errors, &errorHandler,
5437 "unicodeescape", message,
5438 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005439 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005440 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005441 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005442 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005443 }
5444 chr = (chr<<4) & ~0xF;
5445 if (c >= '0' && c <= '9')
5446 chr += c - '0';
5447 else if (c >= 'a' && c <= 'f')
5448 chr += 10 + c - 'a';
5449 else
5450 chr += 10 + c - 'A';
5451 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005452 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005453 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005454 /* _decoding_error will have already written into the
5455 target buffer. */
5456 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005457 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005458 /* when we get here, chr is a 32-bit unicode character */
5459 if (chr <= 0xffff)
5460 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005461 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005462 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005463 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005464 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005465#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005466 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005467#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005468 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005469 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5470 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005471#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005472 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005473 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005474 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005475 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005476 errors, &errorHandler,
5477 "unicodeescape", "illegal Unicode character",
5478 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005479 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005480 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005481 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005482 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005483 break;
5484
Benjamin Peterson29060642009-01-31 22:14:21 +00005485 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005486 case 'N':
5487 message = "malformed \\N character escape";
5488 if (ucnhash_CAPI == NULL) {
5489 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005490 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5491 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005492 if (ucnhash_CAPI == NULL)
5493 goto ucnhashError;
5494 }
5495 if (*s == '{') {
5496 const char *start = s+1;
5497 /* look for the closing brace */
5498 while (*s != '}' && s < end)
5499 s++;
5500 if (s > start && s < end && *s == '}') {
5501 /* found a name. look it up in the unicode database */
5502 message = "unknown Unicode character name";
5503 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005504 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5505 &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005506 goto store;
5507 }
5508 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005509 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005510 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005511 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005512 errors, &errorHandler,
5513 "unicodeescape", message,
5514 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005515 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005516 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005517 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005518 break;
5519
5520 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005521 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005522 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005523 message = "\\ at end of string";
5524 s--;
5525 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005526 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005527 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005528 errors, &errorHandler,
5529 "unicodeescape", message,
5530 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005531 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005532 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005533 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005534 }
5535 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005536 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5537 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005538 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005539 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005540 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005541 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005542 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005543 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005544 /* Ensure the length prediction worked in case of ASCII strings */
5545 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5546
Victor Stinnerfe226c02011-10-03 03:52:20 +02005547 if (kind == PyUnicode_WCHAR_KIND)
5548 {
5549 if (PyUnicode_Resize((PyObject**)&v, i) < 0)
5550 goto onError;
Victor Stinnerfe226c02011-10-03 03:52:20 +02005551 }
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005552 Py_XDECREF(errorHandler);
5553 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005554 if (_PyUnicode_READY_REPLACE(&v)) {
5555 Py_DECREF(v);
5556 return NULL;
5557 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005558 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005559
Benjamin Peterson29060642009-01-31 22:14:21 +00005560 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005561 PyErr_SetString(
5562 PyExc_UnicodeError,
5563 "\\N escapes not supported (can't load unicodedata module)"
5564 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005565 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005566 Py_XDECREF(errorHandler);
5567 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005568 return NULL;
5569
Benjamin Peterson29060642009-01-31 22:14:21 +00005570 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005571 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005572 Py_XDECREF(errorHandler);
5573 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005574 return NULL;
5575}
5576
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005577#undef WRITE_ASCII_OR_WSTR
5578#undef WRITE_WSTR
5579
/* Return a Unicode-Escape encoded version of the given Py_UNICODE buffer
   as a bytes object.

   The result is not enclosed in quotes.

*/
5586
Walter Dörwald79e913e2007-05-12 11:08:06 +00005587static const char *hexdigits = "0123456789abcdef";
5588
Alexander Belopolsky40018472011-02-26 01:02:56 +00005589PyObject *
5590PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005591 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005592{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005593 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005594 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005595
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005596#ifdef Py_UNICODE_WIDE
5597 const Py_ssize_t expandsize = 10;
5598#else
5599 const Py_ssize_t expandsize = 6;
5600#endif
5601
Thomas Wouters89f507f2006-12-13 04:49:30 +00005602 /* XXX(nnorwitz): rather than over-allocating, it would be
5603 better to choose a different scheme. Perhaps scan the
5604 first N-chars of the string and allocate based on that size.
5605 */
5606 /* Initial allocation is based on the longest-possible unichr
5607 escape.
5608
5609 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5610 unichr, so in this case it's the longest unichr escape. In
5611 narrow (UTF-16) builds this is five chars per source unichr
5612 since there are two unichrs in the surrogate pair, so in narrow
5613 (UTF-16) builds it's not the longest unichr escape.
5614
5615 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5616 so in the narrow (UTF-16) build case it's the longest unichr
5617 escape.
5618 */
5619
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005620 if (size == 0)
5621 return PyBytes_FromStringAndSize(NULL, 0);
5622
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005623 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005624 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005625
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005626 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005627 2
5628 + expandsize*size
5629 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005630 if (repr == NULL)
5631 return NULL;
5632
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005633 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005634
Guido van Rossumd57fd912000-03-10 22:53:23 +00005635 while (size-- > 0) {
5636 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005637
Walter Dörwald79e913e2007-05-12 11:08:06 +00005638 /* Escape backslashes */
5639 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005640 *p++ = '\\';
5641 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005642 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005643 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005644
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00005645#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005646 /* Map 21-bit characters to '\U00xxxxxx' */
5647 else if (ch >= 0x10000) {
5648 *p++ = '\\';
5649 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005650 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5651 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5652 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5653 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5654 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5655 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5656 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5657 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005658 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005659 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005660#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005661 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5662 else if (ch >= 0xD800 && ch < 0xDC00) {
5663 Py_UNICODE ch2;
5664 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00005665
Benjamin Peterson29060642009-01-31 22:14:21 +00005666 ch2 = *s++;
5667 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005668 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005669 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5670 *p++ = '\\';
5671 *p++ = 'U';
5672 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
5673 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
5674 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
5675 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
5676 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
5677 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
5678 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
5679 *p++ = hexdigits[ucs & 0x0000000F];
5680 continue;
5681 }
5682 /* Fall through: isolated surrogates are copied as-is */
5683 s--;
5684 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005685 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005686#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005687
Guido van Rossumd57fd912000-03-10 22:53:23 +00005688 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005689 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005690 *p++ = '\\';
5691 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005692 *p++ = hexdigits[(ch >> 12) & 0x000F];
5693 *p++ = hexdigits[(ch >> 8) & 0x000F];
5694 *p++ = hexdigits[(ch >> 4) & 0x000F];
5695 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005696 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005697
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005698 /* Map special whitespace to '\t', \n', '\r' */
5699 else if (ch == '\t') {
5700 *p++ = '\\';
5701 *p++ = 't';
5702 }
5703 else if (ch == '\n') {
5704 *p++ = '\\';
5705 *p++ = 'n';
5706 }
5707 else if (ch == '\r') {
5708 *p++ = '\\';
5709 *p++ = 'r';
5710 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005711
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005712 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005713 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005714 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005715 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005716 *p++ = hexdigits[(ch >> 4) & 0x000F];
5717 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005718 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005719
Guido van Rossumd57fd912000-03-10 22:53:23 +00005720 /* Copy everything else as-is */
5721 else
5722 *p++ = (char) ch;
5723 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005724
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005725 assert(p - PyBytes_AS_STRING(repr) > 0);
5726 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5727 return NULL;
5728 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005729}
5730
Alexander Belopolsky40018472011-02-26 01:02:56 +00005731PyObject *
5732PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005733{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005734 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005735 if (!PyUnicode_Check(unicode)) {
5736 PyErr_BadArgument();
5737 return NULL;
5738 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00005739 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5740 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005741 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005742}
5743
5744/* --- Raw Unicode Escape Codec ------------------------------------------- */
5745
Alexander Belopolsky40018472011-02-26 01:02:56 +00005746PyObject *
5747PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005748 Py_ssize_t size,
5749 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005750{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005751 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005752 Py_ssize_t startinpos;
5753 Py_ssize_t endinpos;
5754 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005755 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005756 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005757 const char *end;
5758 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005759 PyObject *errorHandler = NULL;
5760 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005761
Guido van Rossumd57fd912000-03-10 22:53:23 +00005762 /* Escaped strings will always be longer than the resulting
5763 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005764 length after conversion to the true value. (But decoding error
5765 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005766 v = _PyUnicode_New(size);
5767 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005768 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005769 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005770 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005771 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005772 end = s + size;
5773 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005774 unsigned char c;
5775 Py_UCS4 x;
5776 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005777 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005778
Benjamin Peterson29060642009-01-31 22:14:21 +00005779 /* Non-escape characters are interpreted as Unicode ordinals */
5780 if (*s != '\\') {
5781 *p++ = (unsigned char)*s++;
5782 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005783 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005784 startinpos = s-starts;
5785
5786 /* \u-escapes are only interpreted iff the number of leading
5787 backslashes if odd */
5788 bs = s;
5789 for (;s < end;) {
5790 if (*s != '\\')
5791 break;
5792 *p++ = (unsigned char)*s++;
5793 }
5794 if (((s - bs) & 1) == 0 ||
5795 s >= end ||
5796 (*s != 'u' && *s != 'U')) {
5797 continue;
5798 }
5799 p--;
5800 count = *s=='u' ? 4 : 8;
5801 s++;
5802
5803 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
5804 outpos = p-PyUnicode_AS_UNICODE(v);
5805 for (x = 0, i = 0; i < count; ++i, ++s) {
5806 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005807 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005808 endinpos = s-starts;
5809 if (unicode_decode_call_errorhandler(
5810 errors, &errorHandler,
5811 "rawunicodeescape", "truncated \\uXXXX",
5812 &starts, &end, &startinpos, &endinpos, &exc, &s,
5813 &v, &outpos, &p))
5814 goto onError;
5815 goto nextByte;
5816 }
5817 x = (x<<4) & ~0xF;
5818 if (c >= '0' && c <= '9')
5819 x += c - '0';
5820 else if (c >= 'a' && c <= 'f')
5821 x += 10 + c - 'a';
5822 else
5823 x += 10 + c - 'A';
5824 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00005825 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00005826 /* UCS-2 character */
5827 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005828 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005829 /* UCS-4 character. Either store directly, or as
5830 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00005831#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005832 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005833#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005834 x -= 0x10000L;
5835 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
5836 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00005837#endif
5838 } else {
5839 endinpos = s-starts;
5840 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005841 if (unicode_decode_call_errorhandler(
5842 errors, &errorHandler,
5843 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005844 &starts, &end, &startinpos, &endinpos, &exc, &s,
5845 &v, &outpos, &p))
5846 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005847 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005848 nextByte:
5849 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005850 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02005851 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005852 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005853 Py_XDECREF(errorHandler);
5854 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005855 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005856 Py_DECREF(v);
5857 return NULL;
5858 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005859 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005860
Benjamin Peterson29060642009-01-31 22:14:21 +00005861 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005862 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005863 Py_XDECREF(errorHandler);
5864 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005865 return NULL;
5866}
5867
Alexander Belopolsky40018472011-02-26 01:02:56 +00005868PyObject *
5869PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005870 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005871{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005872 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005873 char *p;
5874 char *q;
5875
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005876#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005877 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005878#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005879 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005880#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00005881
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005882 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005883 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005884
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005885 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005886 if (repr == NULL)
5887 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005888 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005889 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005890
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005891 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005892 while (size-- > 0) {
5893 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005894#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005895 /* Map 32-bit characters to '\Uxxxxxxxx' */
5896 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005897 *p++ = '\\';
5898 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005899 *p++ = hexdigits[(ch >> 28) & 0xf];
5900 *p++ = hexdigits[(ch >> 24) & 0xf];
5901 *p++ = hexdigits[(ch >> 20) & 0xf];
5902 *p++ = hexdigits[(ch >> 16) & 0xf];
5903 *p++ = hexdigits[(ch >> 12) & 0xf];
5904 *p++ = hexdigits[(ch >> 8) & 0xf];
5905 *p++ = hexdigits[(ch >> 4) & 0xf];
5906 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00005907 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005908 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00005909#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005910 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5911 if (ch >= 0xD800 && ch < 0xDC00) {
5912 Py_UNICODE ch2;
5913 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005914
Benjamin Peterson29060642009-01-31 22:14:21 +00005915 ch2 = *s++;
5916 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005917 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005918 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5919 *p++ = '\\';
5920 *p++ = 'U';
5921 *p++ = hexdigits[(ucs >> 28) & 0xf];
5922 *p++ = hexdigits[(ucs >> 24) & 0xf];
5923 *p++ = hexdigits[(ucs >> 20) & 0xf];
5924 *p++ = hexdigits[(ucs >> 16) & 0xf];
5925 *p++ = hexdigits[(ucs >> 12) & 0xf];
5926 *p++ = hexdigits[(ucs >> 8) & 0xf];
5927 *p++ = hexdigits[(ucs >> 4) & 0xf];
5928 *p++ = hexdigits[ucs & 0xf];
5929 continue;
5930 }
5931 /* Fall through: isolated surrogates are copied as-is */
5932 s--;
5933 size++;
5934 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005935#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005936 /* Map 16-bit characters to '\uxxxx' */
5937 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005938 *p++ = '\\';
5939 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005940 *p++ = hexdigits[(ch >> 12) & 0xf];
5941 *p++ = hexdigits[(ch >> 8) & 0xf];
5942 *p++ = hexdigits[(ch >> 4) & 0xf];
5943 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005944 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005945 /* Copy everything else as-is */
5946 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00005947 *p++ = (char) ch;
5948 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005949 size = p - q;
5950
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005951 assert(size > 0);
5952 if (_PyBytes_Resize(&repr, size) < 0)
5953 return NULL;
5954 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005955}
5956
Alexander Belopolsky40018472011-02-26 01:02:56 +00005957PyObject *
5958PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005959{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005960 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005961 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00005962 PyErr_BadArgument();
5963 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005964 }
Walter Dörwald711005d2007-05-12 12:03:26 +00005965 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5966 PyUnicode_GET_SIZE(unicode));
5967
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005968 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005969}
5970
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005971/* --- Unicode Internal Codec ------------------------------------------- */
5972
Alexander Belopolsky40018472011-02-26 01:02:56 +00005973PyObject *
5974_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005975 Py_ssize_t size,
5976 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005977{
5978 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005979 Py_ssize_t startinpos;
5980 Py_ssize_t endinpos;
5981 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005982 PyUnicodeObject *v;
5983 Py_UNICODE *p;
5984 const char *end;
5985 const char *reason;
5986 PyObject *errorHandler = NULL;
5987 PyObject *exc = NULL;
5988
Neal Norwitzd43069c2006-01-08 01:12:10 +00005989#ifdef Py_UNICODE_WIDE
5990 Py_UNICODE unimax = PyUnicode_GetMax();
5991#endif
5992
Thomas Wouters89f507f2006-12-13 04:49:30 +00005993 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005994 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
5995 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005996 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005997 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
5998 as string was created with the old API. */
5999 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006000 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006001 p = PyUnicode_AS_UNICODE(v);
6002 end = s + size;
6003
6004 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006005 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006006 /* We have to sanity check the raw data, otherwise doom looms for
6007 some malformed UCS-4 data. */
6008 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006009#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006010 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006011#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006012 end-s < Py_UNICODE_SIZE
6013 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006014 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006015 startinpos = s - starts;
6016 if (end-s < Py_UNICODE_SIZE) {
6017 endinpos = end-starts;
6018 reason = "truncated input";
6019 }
6020 else {
6021 endinpos = s - starts + Py_UNICODE_SIZE;
6022 reason = "illegal code point (> 0x10FFFF)";
6023 }
6024 outpos = p - PyUnicode_AS_UNICODE(v);
6025 if (unicode_decode_call_errorhandler(
6026 errors, &errorHandler,
6027 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006028 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00006029 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006030 goto onError;
6031 }
6032 }
6033 else {
6034 p++;
6035 s += Py_UNICODE_SIZE;
6036 }
6037 }
6038
Victor Stinnerfe226c02011-10-03 03:52:20 +02006039 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006040 goto onError;
6041 Py_XDECREF(errorHandler);
6042 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006043 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006044 Py_DECREF(v);
6045 return NULL;
6046 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006047 return (PyObject *)v;
6048
Benjamin Peterson29060642009-01-31 22:14:21 +00006049 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006050 Py_XDECREF(v);
6051 Py_XDECREF(errorHandler);
6052 Py_XDECREF(exc);
6053 return NULL;
6054}
6055
Guido van Rossumd57fd912000-03-10 22:53:23 +00006056/* --- Latin-1 Codec ------------------------------------------------------ */
6057
Alexander Belopolsky40018472011-02-26 01:02:56 +00006058PyObject *
6059PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006060 Py_ssize_t size,
6061 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006062{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006063 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006064 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006065}
6066
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006067/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006068static void
6069make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006070 const char *encoding,
6071 const Py_UNICODE *unicode, Py_ssize_t size,
6072 Py_ssize_t startpos, Py_ssize_t endpos,
6073 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006074{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006075 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006076 *exceptionObject = PyUnicodeEncodeError_Create(
6077 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006078 }
6079 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006080 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6081 goto onError;
6082 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6083 goto onError;
6084 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6085 goto onError;
6086 return;
6087 onError:
6088 Py_DECREF(*exceptionObject);
6089 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006090 }
6091}
6092
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006093/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006094static void
6095raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006096 const char *encoding,
6097 const Py_UNICODE *unicode, Py_ssize_t size,
6098 Py_ssize_t startpos, Py_ssize_t endpos,
6099 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006100{
6101 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006102 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006103 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006104 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006105}
6106
6107/* error handling callback helper:
6108 build arguments, call the callback and check the arguments,
6109 put the result into newpos and return the replacement string, which
6110 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006111static PyObject *
6112unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006113 PyObject **errorHandler,
6114 const char *encoding, const char *reason,
6115 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
6116 Py_ssize_t startpos, Py_ssize_t endpos,
6117 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006118{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006119 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006120
6121 PyObject *restuple;
6122 PyObject *resunicode;
6123
6124 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006125 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006126 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006127 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006128 }
6129
6130 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006131 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006132 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006133 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006134
6135 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006136 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006137 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006138 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006139 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006140 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006141 Py_DECREF(restuple);
6142 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006143 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006144 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006145 &resunicode, newpos)) {
6146 Py_DECREF(restuple);
6147 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006148 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006149 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6150 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6151 Py_DECREF(restuple);
6152 return NULL;
6153 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006154 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006155 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006156 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006157 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6158 Py_DECREF(restuple);
6159 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006160 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006161 Py_INCREF(resunicode);
6162 Py_DECREF(restuple);
6163 return resunicode;
6164}
6165
Alexander Belopolsky40018472011-02-26 01:02:56 +00006166static PyObject *
6167unicode_encode_ucs1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006168 Py_ssize_t size,
6169 const char *errors,
6170 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006171{
6172 /* output object */
6173 PyObject *res;
6174 /* pointers to the beginning and end+1 of input */
6175 const Py_UNICODE *startp = p;
6176 const Py_UNICODE *endp = p + size;
6177 /* pointer to the beginning of the unencodable characters */
6178 /* const Py_UNICODE *badp = NULL; */
6179 /* pointer into the output */
6180 char *str;
6181 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006182 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006183 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6184 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006185 PyObject *errorHandler = NULL;
6186 PyObject *exc = NULL;
6187 /* the following variable is used for caching string comparisons
6188 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6189 int known_errorHandler = -1;
6190
6191 /* allocate enough for a simple encoding without
6192 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006193 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006194 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006195 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006196 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006197 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006198 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006199 ressize = size;
6200
6201 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006202 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006203
Benjamin Peterson29060642009-01-31 22:14:21 +00006204 /* can we encode this? */
6205 if (c<limit) {
6206 /* no overflow check, because we know that the space is enough */
6207 *str++ = (char)c;
6208 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006209 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006210 else {
6211 Py_ssize_t unicodepos = p-startp;
6212 Py_ssize_t requiredsize;
6213 PyObject *repunicode;
6214 Py_ssize_t repsize;
6215 Py_ssize_t newpos;
6216 Py_ssize_t respos;
6217 Py_UNICODE *uni2;
6218 /* startpos for collecting unencodable chars */
6219 const Py_UNICODE *collstart = p;
6220 const Py_UNICODE *collend = p;
6221 /* find all unecodable characters */
6222 while ((collend < endp) && ((*collend)>=limit))
6223 ++collend;
6224 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6225 if (known_errorHandler==-1) {
6226 if ((errors==NULL) || (!strcmp(errors, "strict")))
6227 known_errorHandler = 1;
6228 else if (!strcmp(errors, "replace"))
6229 known_errorHandler = 2;
6230 else if (!strcmp(errors, "ignore"))
6231 known_errorHandler = 3;
6232 else if (!strcmp(errors, "xmlcharrefreplace"))
6233 known_errorHandler = 4;
6234 else
6235 known_errorHandler = 0;
6236 }
6237 switch (known_errorHandler) {
6238 case 1: /* strict */
6239 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
6240 goto onError;
6241 case 2: /* replace */
6242 while (collstart++<collend)
6243 *str++ = '?'; /* fall through */
6244 case 3: /* ignore */
6245 p = collend;
6246 break;
6247 case 4: /* xmlcharrefreplace */
6248 respos = str - PyBytes_AS_STRING(res);
6249 /* determine replacement size (temporarily (mis)uses p) */
6250 for (p = collstart, repsize = 0; p < collend; ++p) {
6251 if (*p<10)
6252 repsize += 2+1+1;
6253 else if (*p<100)
6254 repsize += 2+2+1;
6255 else if (*p<1000)
6256 repsize += 2+3+1;
6257 else if (*p<10000)
6258 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006259#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006260 else
6261 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006262#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006263 else if (*p<100000)
6264 repsize += 2+5+1;
6265 else if (*p<1000000)
6266 repsize += 2+6+1;
6267 else
6268 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00006269#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006270 }
6271 requiredsize = respos+repsize+(endp-collend);
6272 if (requiredsize > ressize) {
6273 if (requiredsize<2*ressize)
6274 requiredsize = 2*ressize;
6275 if (_PyBytes_Resize(&res, requiredsize))
6276 goto onError;
6277 str = PyBytes_AS_STRING(res) + respos;
6278 ressize = requiredsize;
6279 }
6280 /* generate replacement (temporarily (mis)uses p) */
6281 for (p = collstart; p < collend; ++p) {
6282 str += sprintf(str, "&#%d;", (int)*p);
6283 }
6284 p = collend;
6285 break;
6286 default:
6287 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6288 encoding, reason, startp, size, &exc,
6289 collstart-startp, collend-startp, &newpos);
6290 if (repunicode == NULL)
6291 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006292 if (PyBytes_Check(repunicode)) {
6293 /* Directly copy bytes result to output. */
6294 repsize = PyBytes_Size(repunicode);
6295 if (repsize > 1) {
6296 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006297 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006298 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6299 Py_DECREF(repunicode);
6300 goto onError;
6301 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006302 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006303 ressize += repsize-1;
6304 }
6305 memcpy(str, PyBytes_AsString(repunicode), repsize);
6306 str += repsize;
6307 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006308 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006309 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006310 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006311 /* need more space? (at least enough for what we
6312 have+the replacement+the rest of the string, so
6313 we won't have to check space for encodable characters) */
6314 respos = str - PyBytes_AS_STRING(res);
6315 repsize = PyUnicode_GET_SIZE(repunicode);
6316 requiredsize = respos+repsize+(endp-collend);
6317 if (requiredsize > ressize) {
6318 if (requiredsize<2*ressize)
6319 requiredsize = 2*ressize;
6320 if (_PyBytes_Resize(&res, requiredsize)) {
6321 Py_DECREF(repunicode);
6322 goto onError;
6323 }
6324 str = PyBytes_AS_STRING(res) + respos;
6325 ressize = requiredsize;
6326 }
6327 /* check if there is anything unencodable in the replacement
6328 and copy it to the output */
6329 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
6330 c = *uni2;
6331 if (c >= limit) {
6332 raise_encode_exception(&exc, encoding, startp, size,
6333 unicodepos, unicodepos+1, reason);
6334 Py_DECREF(repunicode);
6335 goto onError;
6336 }
6337 *str = (char)c;
6338 }
6339 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006340 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006341 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006342 }
6343 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006344 /* Resize if we allocated to much */
6345 size = str - PyBytes_AS_STRING(res);
6346 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006347 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006348 if (_PyBytes_Resize(&res, size) < 0)
6349 goto onError;
6350 }
6351
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006352 Py_XDECREF(errorHandler);
6353 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006354 return res;
6355
6356 onError:
6357 Py_XDECREF(res);
6358 Py_XDECREF(errorHandler);
6359 Py_XDECREF(exc);
6360 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006361}
6362
Alexander Belopolsky40018472011-02-26 01:02:56 +00006363PyObject *
6364PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006365 Py_ssize_t size,
6366 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006367{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006368 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006369}
6370
Alexander Belopolsky40018472011-02-26 01:02:56 +00006371PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006372_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006373{
6374 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006375 PyErr_BadArgument();
6376 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006377 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006378 if (PyUnicode_READY(unicode) == -1)
6379 return NULL;
6380 /* Fast path: if it is a one-byte string, construct
6381 bytes object directly. */
6382 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6383 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6384 PyUnicode_GET_LENGTH(unicode));
6385 /* Non-Latin-1 characters present. Defer to above function to
6386 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006387 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006388 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006389 errors);
6390}
6391
6392PyObject*
6393PyUnicode_AsLatin1String(PyObject *unicode)
6394{
6395 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006396}
6397
6398/* --- 7-bit ASCII Codec -------------------------------------------------- */
6399
Alexander Belopolsky40018472011-02-26 01:02:56 +00006400PyObject *
6401PyUnicode_DecodeASCII(const char *s,
6402 Py_ssize_t size,
6403 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006404{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006405 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006406 PyUnicodeObject *v;
6407 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006408 Py_ssize_t startinpos;
6409 Py_ssize_t endinpos;
6410 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006411 const char *e;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006412 unsigned char* d;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006413 PyObject *errorHandler = NULL;
6414 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006415 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00006416
Guido van Rossumd57fd912000-03-10 22:53:23 +00006417 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006418 if (size == 1 && *(unsigned char*)s < 128)
6419 return PyUnicode_FromOrdinal(*(unsigned char*)s);
6420
6421 /* Fast path. Assume the input actually *is* ASCII, and allocate
6422 a single-block Unicode object with that assumption. If there is
6423 an error, drop the object and start over. */
6424 v = (PyUnicodeObject*)PyUnicode_New(size, 127);
6425 if (v == NULL)
6426 goto onError;
6427 d = PyUnicode_1BYTE_DATA(v);
6428 for (i = 0; i < size; i++) {
6429 unsigned char ch = ((unsigned char*)s)[i];
6430 if (ch < 128)
6431 d[i] = ch;
6432 else
6433 break;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006434 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006435 if (i == size)
6436 return (PyObject*)v;
6437 Py_DECREF(v); /* start over */
Tim Petersced69f82003-09-16 20:30:58 +00006438
Guido van Rossumd57fd912000-03-10 22:53:23 +00006439 v = _PyUnicode_New(size);
6440 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006441 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006442 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006443 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006444 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006445 e = s + size;
6446 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006447 register unsigned char c = (unsigned char)*s;
6448 if (c < 128) {
6449 *p++ = c;
6450 ++s;
6451 }
6452 else {
6453 startinpos = s-starts;
6454 endinpos = startinpos + 1;
6455 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
6456 if (unicode_decode_call_errorhandler(
6457 errors, &errorHandler,
6458 "ascii", "ordinal not in range(128)",
6459 &starts, &e, &startinpos, &endinpos, &exc, &s,
6460 &v, &outpos, &p))
6461 goto onError;
6462 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006463 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00006464 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinnerfe226c02011-10-03 03:52:20 +02006465 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006466 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006467 Py_XDECREF(errorHandler);
6468 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006469 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006470 Py_DECREF(v);
6471 return NULL;
6472 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006473 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006474
Benjamin Peterson29060642009-01-31 22:14:21 +00006475 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006476 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006477 Py_XDECREF(errorHandler);
6478 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006479 return NULL;
6480}
6481
Alexander Belopolsky40018472011-02-26 01:02:56 +00006482PyObject *
6483PyUnicode_EncodeASCII(const Py_UNICODE *p,
6484 Py_ssize_t size,
6485 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006486{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006487 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006488}
6489
Alexander Belopolsky40018472011-02-26 01:02:56 +00006490PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006491_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006492{
6493 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006494 PyErr_BadArgument();
6495 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006496 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006497 if (PyUnicode_READY(unicode) == -1)
6498 return NULL;
6499 /* Fast path: if it is an ASCII-only string, construct bytes object
6500 directly. Else defer to above function to raise the exception. */
6501 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6502 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6503 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006504 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006505 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006506 errors);
6507}
6508
6509PyObject *
6510PyUnicode_AsASCIIString(PyObject *unicode)
6511{
6512 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006513}
6514
Victor Stinner99b95382011-07-04 14:23:54 +02006515#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006516
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006517/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006518
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006519#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006520#define NEED_RETRY
6521#endif
6522
6523/* XXX This code is limited to "true" double-byte encodings, as
6524 a) it assumes an incomplete character consists of a single byte, and
6525 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00006526 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006527
/* Return non-zero if the byte at s[offset] is a DBCS lead byte that
   starts a character, 0 otherwise.  A byte that IsDBCSLeadByte() accepts
   is only counted when CharPrev() shows it is not the trail byte of the
   preceding character. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2);
}
6540/*
6541 * Decode MBCS string into unicode object. If 'final' is set, converts
6542 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6543 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006544static int
6545decode_mbcs(PyUnicodeObject **v,
6546 const char *s, /* MBCS string */
6547 int size, /* sizeof MBCS string */
6548 int final,
6549 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006550{
6551 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00006552 Py_ssize_t n;
6553 DWORD usize;
6554 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006555
6556 assert(size >= 0);
6557
Victor Stinner554f3f02010-06-16 23:33:54 +00006558 /* check and handle 'errors' arg */
6559 if (errors==NULL || strcmp(errors, "strict")==0)
6560 flags = MB_ERR_INVALID_CHARS;
6561 else if (strcmp(errors, "ignore")==0)
6562 flags = 0;
6563 else {
6564 PyErr_Format(PyExc_ValueError,
6565 "mbcs encoding does not support errors='%s'",
6566 errors);
6567 return -1;
6568 }
6569
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006570 /* Skip trailing lead-byte unless 'final' is set */
6571 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006572 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006573
6574 /* First get the size of the result */
6575 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006576 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6577 if (usize==0)
6578 goto mbcs_decode_error;
6579 } else
6580 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006581
6582 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006583 /* Create unicode object */
6584 *v = _PyUnicode_New(usize);
6585 if (*v == NULL)
6586 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006587 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006588 }
6589 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006590 /* Extend unicode object */
6591 n = PyUnicode_GET_SIZE(*v);
Victor Stinner2fd82272011-10-03 04:06:05 +02006592 if (PyUnicode_Resize((PyObject**)v, n + usize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006593 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006594 }
6595
6596 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00006597 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006598 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006599 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6600 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00006601 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006602 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006603 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00006604
6605mbcs_decode_error:
6606 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6607 we raise a UnicodeDecodeError - else it is a 'generic'
6608 windows error
6609 */
6610 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6611 /* Ideally, we should get reason from FormatMessage - this
6612 is the Windows 2000 English version of the message
6613 */
6614 PyObject *exc = NULL;
6615 const char *reason = "No mapping for the Unicode character exists "
6616 "in the target multi-byte code page.";
6617 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6618 if (exc != NULL) {
6619 PyCodec_StrictErrors(exc);
6620 Py_DECREF(exc);
6621 }
6622 } else {
6623 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6624 }
6625 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006626}
6627
Alexander Belopolsky40018472011-02-26 01:02:56 +00006628PyObject *
6629PyUnicode_DecodeMBCSStateful(const char *s,
6630 Py_ssize_t size,
6631 const char *errors,
6632 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006633{
6634 PyUnicodeObject *v = NULL;
6635 int done;
6636
6637 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006638 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006639
6640#ifdef NEED_RETRY
6641 retry:
6642 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006643 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006644 else
6645#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006646 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006647
6648 if (done < 0) {
6649 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006650 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006651 }
6652
6653 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006654 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006655
6656#ifdef NEED_RETRY
6657 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006658 s += done;
6659 size -= done;
6660 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006661 }
6662#endif
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006663 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006664 Py_DECREF(v);
6665 return NULL;
6666 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006667 return (PyObject *)v;
6668}
6669
Alexander Belopolsky40018472011-02-26 01:02:56 +00006670PyObject *
6671PyUnicode_DecodeMBCS(const char *s,
6672 Py_ssize_t size,
6673 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006674{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006675 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6676}
6677
6678/*
6679 * Convert unicode into string object (MBCS).
6680 * Returns 0 if succeed, -1 otherwise.
6681 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006682static int
6683encode_mbcs(PyObject **repr,
6684 const Py_UNICODE *p, /* unicode */
6685 int size, /* size of unicode */
6686 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006687{
Victor Stinner554f3f02010-06-16 23:33:54 +00006688 BOOL usedDefaultChar = FALSE;
6689 BOOL *pusedDefaultChar;
6690 int mbcssize;
6691 Py_ssize_t n;
6692 PyObject *exc = NULL;
6693 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006694
6695 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006696
Victor Stinner554f3f02010-06-16 23:33:54 +00006697 /* check and handle 'errors' arg */
6698 if (errors==NULL || strcmp(errors, "strict")==0) {
6699 flags = WC_NO_BEST_FIT_CHARS;
6700 pusedDefaultChar = &usedDefaultChar;
6701 } else if (strcmp(errors, "replace")==0) {
6702 flags = 0;
6703 pusedDefaultChar = NULL;
6704 } else {
6705 PyErr_Format(PyExc_ValueError,
6706 "mbcs encoding does not support errors='%s'",
6707 errors);
6708 return -1;
6709 }
6710
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006711 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006712 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006713 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
6714 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00006715 if (mbcssize == 0) {
6716 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6717 return -1;
6718 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006719 /* If we used a default char, then we failed! */
6720 if (pusedDefaultChar && *pusedDefaultChar)
6721 goto mbcs_encode_error;
6722 } else {
6723 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006724 }
6725
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006726 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006727 /* Create string object */
6728 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
6729 if (*repr == NULL)
6730 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006731 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006732 }
6733 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006734 /* Extend string object */
6735 n = PyBytes_Size(*repr);
6736 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
6737 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006738 }
6739
6740 /* Do the conversion */
6741 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006742 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006743 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
6744 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006745 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6746 return -1;
6747 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006748 if (pusedDefaultChar && *pusedDefaultChar)
6749 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006750 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006751 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006752
6753mbcs_encode_error:
6754 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
6755 Py_XDECREF(exc);
6756 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006757}
6758
Alexander Belopolsky40018472011-02-26 01:02:56 +00006759PyObject *
6760PyUnicode_EncodeMBCS(const Py_UNICODE *p,
6761 Py_ssize_t size,
6762 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006763{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006764 PyObject *repr = NULL;
6765 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00006766
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006767#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00006768 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006769 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006770 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006771 else
6772#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006773 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006774
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006775 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006776 Py_XDECREF(repr);
6777 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006778 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006779
6780#ifdef NEED_RETRY
6781 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006782 p += INT_MAX;
6783 size -= INT_MAX;
6784 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006785 }
6786#endif
6787
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006788 return repr;
6789}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006790
Alexander Belopolsky40018472011-02-26 01:02:56 +00006791PyObject *
6792PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006793{
6794 if (!PyUnicode_Check(unicode)) {
6795 PyErr_BadArgument();
6796 return NULL;
6797 }
6798 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006799 PyUnicode_GET_SIZE(unicode),
6800 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006801}
6802
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006803#undef NEED_RETRY
6804
Victor Stinner99b95382011-07-04 14:23:54 +02006805#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006806
Guido van Rossumd57fd912000-03-10 22:53:23 +00006807/* --- Character Mapping Codec -------------------------------------------- */
6808
Alexander Belopolsky40018472011-02-26 01:02:56 +00006809PyObject *
6810PyUnicode_DecodeCharmap(const char *s,
6811 Py_ssize_t size,
6812 PyObject *mapping,
6813 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006814{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006815 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006816 Py_ssize_t startinpos;
6817 Py_ssize_t endinpos;
6818 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006819 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006820 PyUnicodeObject *v;
6821 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006822 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006823 PyObject *errorHandler = NULL;
6824 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006825 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006826 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006827
Guido van Rossumd57fd912000-03-10 22:53:23 +00006828 /* Default to Latin-1 */
6829 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006830 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006831
6832 v = _PyUnicode_New(size);
6833 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006834 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006835 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006836 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006837 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006838 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006839 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006840 mapstring = PyUnicode_AS_UNICODE(mapping);
6841 maplen = PyUnicode_GET_SIZE(mapping);
6842 while (s < e) {
6843 unsigned char ch = *s;
6844 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006845
Benjamin Peterson29060642009-01-31 22:14:21 +00006846 if (ch < maplen)
6847 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006848
Benjamin Peterson29060642009-01-31 22:14:21 +00006849 if (x == 0xfffe) {
6850 /* undefined mapping */
6851 outpos = p-PyUnicode_AS_UNICODE(v);
6852 startinpos = s-starts;
6853 endinpos = startinpos+1;
6854 if (unicode_decode_call_errorhandler(
6855 errors, &errorHandler,
6856 "charmap", "character maps to <undefined>",
6857 &starts, &e, &startinpos, &endinpos, &exc, &s,
6858 &v, &outpos, &p)) {
6859 goto onError;
6860 }
6861 continue;
6862 }
6863 *p++ = x;
6864 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006865 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006866 }
6867 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006868 while (s < e) {
6869 unsigned char ch = *s;
6870 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006871
Benjamin Peterson29060642009-01-31 22:14:21 +00006872 /* Get mapping (char ordinal -> integer, Unicode char or None) */
6873 w = PyLong_FromLong((long)ch);
6874 if (w == NULL)
6875 goto onError;
6876 x = PyObject_GetItem(mapping, w);
6877 Py_DECREF(w);
6878 if (x == NULL) {
6879 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6880 /* No mapping found means: mapping is undefined. */
6881 PyErr_Clear();
6882 x = Py_None;
6883 Py_INCREF(x);
6884 } else
6885 goto onError;
6886 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006887
Benjamin Peterson29060642009-01-31 22:14:21 +00006888 /* Apply mapping */
6889 if (PyLong_Check(x)) {
6890 long value = PyLong_AS_LONG(x);
6891 if (value < 0 || value > 65535) {
6892 PyErr_SetString(PyExc_TypeError,
6893 "character mapping must be in range(65536)");
6894 Py_DECREF(x);
6895 goto onError;
6896 }
6897 *p++ = (Py_UNICODE)value;
6898 }
6899 else if (x == Py_None) {
6900 /* undefined mapping */
6901 outpos = p-PyUnicode_AS_UNICODE(v);
6902 startinpos = s-starts;
6903 endinpos = startinpos+1;
6904 if (unicode_decode_call_errorhandler(
6905 errors, &errorHandler,
6906 "charmap", "character maps to <undefined>",
6907 &starts, &e, &startinpos, &endinpos, &exc, &s,
6908 &v, &outpos, &p)) {
6909 Py_DECREF(x);
6910 goto onError;
6911 }
6912 Py_DECREF(x);
6913 continue;
6914 }
6915 else if (PyUnicode_Check(x)) {
6916 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006917
Benjamin Peterson29060642009-01-31 22:14:21 +00006918 if (targetsize == 1)
6919 /* 1-1 mapping */
6920 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006921
Benjamin Peterson29060642009-01-31 22:14:21 +00006922 else if (targetsize > 1) {
6923 /* 1-n mapping */
6924 if (targetsize > extrachars) {
6925 /* resize first */
6926 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
6927 Py_ssize_t needed = (targetsize - extrachars) + \
6928 (targetsize << 2);
6929 extrachars += needed;
6930 /* XXX overflow detection missing */
Victor Stinnerfe226c02011-10-03 03:52:20 +02006931 if (PyUnicode_Resize((PyObject**)&v,
Benjamin Peterson29060642009-01-31 22:14:21 +00006932 PyUnicode_GET_SIZE(v) + needed) < 0) {
6933 Py_DECREF(x);
6934 goto onError;
6935 }
6936 p = PyUnicode_AS_UNICODE(v) + oldpos;
6937 }
6938 Py_UNICODE_COPY(p,
6939 PyUnicode_AS_UNICODE(x),
6940 targetsize);
6941 p += targetsize;
6942 extrachars -= targetsize;
6943 }
6944 /* 1-0 mapping: skip the character */
6945 }
6946 else {
6947 /* wrong return value */
6948 PyErr_SetString(PyExc_TypeError,
6949 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006950 Py_DECREF(x);
6951 goto onError;
6952 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006953 Py_DECREF(x);
6954 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006955 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006956 }
6957 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinnerfe226c02011-10-03 03:52:20 +02006958 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006959 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006960 Py_XDECREF(errorHandler);
6961 Py_XDECREF(exc);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006962 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006963 Py_DECREF(v);
6964 return NULL;
6965 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006966 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006967
Benjamin Peterson29060642009-01-31 22:14:21 +00006968 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006969 Py_XDECREF(errorHandler);
6970 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006971 Py_XDECREF(v);
6972 return NULL;
6973}
6974
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006975/* Charmap encoding: the lookup table */
6976
Alexander Belopolsky40018472011-02-26 01:02:56 +00006977struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00006978 PyObject_HEAD
6979 unsigned char level1[32];
6980 int count2, count3;
6981 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006982};
6983
6984static PyObject*
6985encoding_map_size(PyObject *obj, PyObject* args)
6986{
6987 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006988 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00006989 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006990}
6991
6992static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006993 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00006994 PyDoc_STR("Return the size (in bytes) of this object") },
6995 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006996};
6997
6998static void
6999encoding_map_dealloc(PyObject* o)
7000{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007001 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007002}
7003
7004static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007005 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007006 "EncodingMap", /*tp_name*/
7007 sizeof(struct encoding_map), /*tp_basicsize*/
7008 0, /*tp_itemsize*/
7009 /* methods */
7010 encoding_map_dealloc, /*tp_dealloc*/
7011 0, /*tp_print*/
7012 0, /*tp_getattr*/
7013 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007014 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007015 0, /*tp_repr*/
7016 0, /*tp_as_number*/
7017 0, /*tp_as_sequence*/
7018 0, /*tp_as_mapping*/
7019 0, /*tp_hash*/
7020 0, /*tp_call*/
7021 0, /*tp_str*/
7022 0, /*tp_getattro*/
7023 0, /*tp_setattro*/
7024 0, /*tp_as_buffer*/
7025 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7026 0, /*tp_doc*/
7027 0, /*tp_traverse*/
7028 0, /*tp_clear*/
7029 0, /*tp_richcompare*/
7030 0, /*tp_weaklistoffset*/
7031 0, /*tp_iter*/
7032 0, /*tp_iternext*/
7033 encoding_map_methods, /*tp_methods*/
7034 0, /*tp_members*/
7035 0, /*tp_getset*/
7036 0, /*tp_base*/
7037 0, /*tp_dict*/
7038 0, /*tp_descr_get*/
7039 0, /*tp_descr_set*/
7040 0, /*tp_dictoffset*/
7041 0, /*tp_init*/
7042 0, /*tp_alloc*/
7043 0, /*tp_new*/
7044 0, /*tp_free*/
7045 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007046};
7047
7048PyObject*
7049PyUnicode_BuildEncodingMap(PyObject* string)
7050{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007051 PyObject *result;
7052 struct encoding_map *mresult;
7053 int i;
7054 int need_dict = 0;
7055 unsigned char level1[32];
7056 unsigned char level2[512];
7057 unsigned char *mlevel1, *mlevel2, *mlevel3;
7058 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007059 int kind;
7060 void *data;
7061 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007062
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007063 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007064 PyErr_BadArgument();
7065 return NULL;
7066 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007067 kind = PyUnicode_KIND(string);
7068 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007069 memset(level1, 0xFF, sizeof level1);
7070 memset(level2, 0xFF, sizeof level2);
7071
7072 /* If there isn't a one-to-one mapping of NULL to \0,
7073 or if there are non-BMP characters, we need to use
7074 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007075 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007076 need_dict = 1;
7077 for (i = 1; i < 256; i++) {
7078 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007079 ch = PyUnicode_READ(kind, data, i);
7080 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007081 need_dict = 1;
7082 break;
7083 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007084 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007085 /* unmapped character */
7086 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007087 l1 = ch >> 11;
7088 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007089 if (level1[l1] == 0xFF)
7090 level1[l1] = count2++;
7091 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007092 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007093 }
7094
7095 if (count2 >= 0xFF || count3 >= 0xFF)
7096 need_dict = 1;
7097
7098 if (need_dict) {
7099 PyObject *result = PyDict_New();
7100 PyObject *key, *value;
7101 if (!result)
7102 return NULL;
7103 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007104 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007105 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007106 if (!key || !value)
7107 goto failed1;
7108 if (PyDict_SetItem(result, key, value) == -1)
7109 goto failed1;
7110 Py_DECREF(key);
7111 Py_DECREF(value);
7112 }
7113 return result;
7114 failed1:
7115 Py_XDECREF(key);
7116 Py_XDECREF(value);
7117 Py_DECREF(result);
7118 return NULL;
7119 }
7120
7121 /* Create a three-level trie */
7122 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7123 16*count2 + 128*count3 - 1);
7124 if (!result)
7125 return PyErr_NoMemory();
7126 PyObject_Init(result, &EncodingMapType);
7127 mresult = (struct encoding_map*)result;
7128 mresult->count2 = count2;
7129 mresult->count3 = count3;
7130 mlevel1 = mresult->level1;
7131 mlevel2 = mresult->level23;
7132 mlevel3 = mresult->level23 + 16*count2;
7133 memcpy(mlevel1, level1, 32);
7134 memset(mlevel2, 0xFF, 16*count2);
7135 memset(mlevel3, 0, 128*count3);
7136 count3 = 0;
7137 for (i = 1; i < 256; i++) {
7138 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007139 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007140 /* unmapped character */
7141 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007142 o1 = PyUnicode_READ(kind, data, i)>>11;
7143 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007144 i2 = 16*mlevel1[o1] + o2;
7145 if (mlevel2[i2] == 0xFF)
7146 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007147 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007148 i3 = 128*mlevel2[i2] + o3;
7149 mlevel3[i3] = i;
7150 }
7151 return result;
7152}
7153
7154static int
7155encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
7156{
7157 struct encoding_map *map = (struct encoding_map*)mapping;
7158 int l1 = c>>11;
7159 int l2 = (c>>7) & 0xF;
7160 int l3 = c & 0x7F;
7161 int i;
7162
7163#ifdef Py_UNICODE_WIDE
7164 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007165 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007166 }
7167#endif
7168 if (c == 0)
7169 return 0;
7170 /* level 1*/
7171 i = map->level1[l1];
7172 if (i == 0xFF) {
7173 return -1;
7174 }
7175 /* level 2*/
7176 i = map->level23[16*i+l2];
7177 if (i == 0xFF) {
7178 return -1;
7179 }
7180 /* level 3 */
7181 i = map->level23[16*map->count2 + 128*i + l3];
7182 if (i == 0) {
7183 return -1;
7184 }
7185 return i;
7186}
7187
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007188/* Lookup the character ch in the mapping. If the character
7189 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007190 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007191static PyObject *
7192charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007193{
Christian Heimes217cfd12007-12-02 14:31:20 +00007194 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007195 PyObject *x;
7196
7197 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007198 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007199 x = PyObject_GetItem(mapping, w);
7200 Py_DECREF(w);
7201 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007202 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7203 /* No mapping found means: mapping is undefined. */
7204 PyErr_Clear();
7205 x = Py_None;
7206 Py_INCREF(x);
7207 return x;
7208 } else
7209 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007210 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007211 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007212 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007213 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007214 long value = PyLong_AS_LONG(x);
7215 if (value < 0 || value > 255) {
7216 PyErr_SetString(PyExc_TypeError,
7217 "character mapping must be in range(256)");
7218 Py_DECREF(x);
7219 return NULL;
7220 }
7221 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007222 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007223 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007224 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007225 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007226 /* wrong return value */
7227 PyErr_Format(PyExc_TypeError,
7228 "character mapping must return integer, bytes or None, not %.400s",
7229 x->ob_type->tp_name);
7230 Py_DECREF(x);
7231 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007232 }
7233}
7234
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007235static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007236charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007237{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007238 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7239 /* exponentially overallocate to minimize reallocations */
7240 if (requiredsize < 2*outsize)
7241 requiredsize = 2*outsize;
7242 if (_PyBytes_Resize(outobj, requiredsize))
7243 return -1;
7244 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007245}
7246
/* Outcome of encoding one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the encoded byte(s) were written */
    enc_FAILED,         /* the character has no mapping; nothing written */
    enc_EXCEPTION       /* a lookup or resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007250/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007251 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007252 space is available. Return a new reference to the object that
7253 was put in the output buffer, or Py_None, if the mapping was undefined
7254 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007255 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007256static charmapencode_result
7257charmapencode_output(Py_UNICODE c, PyObject *mapping,
7258 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007259{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007260 PyObject *rep;
7261 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007262 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007263
Christian Heimes90aa7642007-12-19 02:45:37 +00007264 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007265 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007266 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007267 if (res == -1)
7268 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007269 if (outsize<requiredsize)
7270 if (charmapencode_resize(outobj, outpos, requiredsize))
7271 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007272 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007273 outstart[(*outpos)++] = (char)res;
7274 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007275 }
7276
7277 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007278 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007279 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007280 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007281 Py_DECREF(rep);
7282 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007283 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007284 if (PyLong_Check(rep)) {
7285 Py_ssize_t requiredsize = *outpos+1;
7286 if (outsize<requiredsize)
7287 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7288 Py_DECREF(rep);
7289 return enc_EXCEPTION;
7290 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007291 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007292 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007293 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007294 else {
7295 const char *repchars = PyBytes_AS_STRING(rep);
7296 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7297 Py_ssize_t requiredsize = *outpos+repsize;
7298 if (outsize<requiredsize)
7299 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7300 Py_DECREF(rep);
7301 return enc_EXCEPTION;
7302 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007303 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007304 memcpy(outstart + *outpos, repchars, repsize);
7305 *outpos += repsize;
7306 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007307 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007308 Py_DECREF(rep);
7309 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007310}
7311
7312/* handle an error in PyUnicode_EncodeCharmap
7313 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007314static int
7315charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00007316 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007317 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007318 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007319 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007320{
7321 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007322 Py_ssize_t repsize;
7323 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007324 Py_UNICODE *uni2;
7325 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007326 Py_ssize_t collstartpos = *inpos;
7327 Py_ssize_t collendpos = *inpos+1;
7328 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007329 char *encoding = "charmap";
7330 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007331 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007332
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007333 /* find all unencodable characters */
7334 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007335 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007336 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007337 int res = encoding_map_lookup(p[collendpos], mapping);
7338 if (res != -1)
7339 break;
7340 ++collendpos;
7341 continue;
7342 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007343
Benjamin Peterson29060642009-01-31 22:14:21 +00007344 rep = charmapencode_lookup(p[collendpos], mapping);
7345 if (rep==NULL)
7346 return -1;
7347 else if (rep!=Py_None) {
7348 Py_DECREF(rep);
7349 break;
7350 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007351 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007352 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007353 }
7354 /* cache callback name lookup
7355 * (if not done yet, i.e. it's the first error) */
7356 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007357 if ((errors==NULL) || (!strcmp(errors, "strict")))
7358 *known_errorHandler = 1;
7359 else if (!strcmp(errors, "replace"))
7360 *known_errorHandler = 2;
7361 else if (!strcmp(errors, "ignore"))
7362 *known_errorHandler = 3;
7363 else if (!strcmp(errors, "xmlcharrefreplace"))
7364 *known_errorHandler = 4;
7365 else
7366 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007367 }
7368 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007369 case 1: /* strict */
7370 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7371 return -1;
7372 case 2: /* replace */
7373 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007374 x = charmapencode_output('?', mapping, res, respos);
7375 if (x==enc_EXCEPTION) {
7376 return -1;
7377 }
7378 else if (x==enc_FAILED) {
7379 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7380 return -1;
7381 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007382 }
7383 /* fall through */
7384 case 3: /* ignore */
7385 *inpos = collendpos;
7386 break;
7387 case 4: /* xmlcharrefreplace */
7388 /* generate replacement (temporarily (mis)uses p) */
7389 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007390 char buffer[2+29+1+1];
7391 char *cp;
7392 sprintf(buffer, "&#%d;", (int)p[collpos]);
7393 for (cp = buffer; *cp; ++cp) {
7394 x = charmapencode_output(*cp, mapping, res, respos);
7395 if (x==enc_EXCEPTION)
7396 return -1;
7397 else if (x==enc_FAILED) {
7398 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7399 return -1;
7400 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007401 }
7402 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007403 *inpos = collendpos;
7404 break;
7405 default:
7406 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00007407 encoding, reason, p, size, exceptionObject,
7408 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007409 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007410 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007411 if (PyBytes_Check(repunicode)) {
7412 /* Directly copy bytes result to output. */
7413 Py_ssize_t outsize = PyBytes_Size(*res);
7414 Py_ssize_t requiredsize;
7415 repsize = PyBytes_Size(repunicode);
7416 requiredsize = *respos + repsize;
7417 if (requiredsize > outsize)
7418 /* Make room for all additional bytes. */
7419 if (charmapencode_resize(res, respos, requiredsize)) {
7420 Py_DECREF(repunicode);
7421 return -1;
7422 }
7423 memcpy(PyBytes_AsString(*res) + *respos,
7424 PyBytes_AsString(repunicode), repsize);
7425 *respos += repsize;
7426 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007427 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007428 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007429 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007430 /* generate replacement */
7431 repsize = PyUnicode_GET_SIZE(repunicode);
7432 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007433 x = charmapencode_output(*uni2, mapping, res, respos);
7434 if (x==enc_EXCEPTION) {
7435 return -1;
7436 }
7437 else if (x==enc_FAILED) {
7438 Py_DECREF(repunicode);
7439 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7440 return -1;
7441 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007442 }
7443 *inpos = newpos;
7444 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007445 }
7446 return 0;
7447}
7448
Alexander Belopolsky40018472011-02-26 01:02:56 +00007449PyObject *
7450PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7451 Py_ssize_t size,
7452 PyObject *mapping,
7453 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007454{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007455 /* output object */
7456 PyObject *res = NULL;
7457 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007458 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007459 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007460 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007461 PyObject *errorHandler = NULL;
7462 PyObject *exc = NULL;
7463 /* the following variable is used for caching string comparisons
7464 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7465 * 3=ignore, 4=xmlcharrefreplace */
7466 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007467
7468 /* Default to Latin-1 */
7469 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007470 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007471
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007472 /* allocate enough for a simple encoding without
7473 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007474 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007475 if (res == NULL)
7476 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007477 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007478 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007479
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007480 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007481 /* try to encode it */
7482 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7483 if (x==enc_EXCEPTION) /* error */
7484 goto onError;
7485 if (x==enc_FAILED) { /* unencodable character */
7486 if (charmap_encoding_error(p, size, &inpos, mapping,
7487 &exc,
7488 &known_errorHandler, &errorHandler, errors,
7489 &res, &respos)) {
7490 goto onError;
7491 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007492 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007493 else
7494 /* done with this character => adjust input position */
7495 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007496 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007497
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007498 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007499 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007500 if (_PyBytes_Resize(&res, respos) < 0)
7501 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00007502
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007503 Py_XDECREF(exc);
7504 Py_XDECREF(errorHandler);
7505 return res;
7506
Benjamin Peterson29060642009-01-31 22:14:21 +00007507 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007508 Py_XDECREF(res);
7509 Py_XDECREF(exc);
7510 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007511 return NULL;
7512}
7513
Alexander Belopolsky40018472011-02-26 01:02:56 +00007514PyObject *
7515PyUnicode_AsCharmapString(PyObject *unicode,
7516 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007517{
7518 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007519 PyErr_BadArgument();
7520 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007521 }
7522 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007523 PyUnicode_GET_SIZE(unicode),
7524 mapping,
7525 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007526}
7527
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007528/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007529static void
7530make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007531 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007532 Py_ssize_t startpos, Py_ssize_t endpos,
7533 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007534{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007535 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007536 *exceptionObject = _PyUnicodeTranslateError_Create(
7537 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007538 }
7539 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007540 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7541 goto onError;
7542 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7543 goto onError;
7544 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7545 goto onError;
7546 return;
7547 onError:
7548 Py_DECREF(*exceptionObject);
7549 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007550 }
7551}
7552
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007553/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007554static void
7555raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007556 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007557 Py_ssize_t startpos, Py_ssize_t endpos,
7558 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007559{
7560 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007561 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007562 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007563 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007564}
7565
7566/* error handling callback helper:
7567 build arguments, call the callback and check the arguments,
7568 put the result into newpos and return the replacement string, which
7569 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007570static PyObject *
7571unicode_translate_call_errorhandler(const char *errors,
7572 PyObject **errorHandler,
7573 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007574 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007575 Py_ssize_t startpos, Py_ssize_t endpos,
7576 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007577{
Benjamin Peterson142957c2008-07-04 19:55:29 +00007578 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007579
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007580 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007581 PyObject *restuple;
7582 PyObject *resunicode;
7583
7584 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007585 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007586 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007587 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007588 }
7589
7590 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007591 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007592 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007593 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007594
7595 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00007596 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007597 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007598 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007599 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00007600 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007601 Py_DECREF(restuple);
7602 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007603 }
7604 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00007605 &resunicode, &i_newpos)) {
7606 Py_DECREF(restuple);
7607 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007608 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007609 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007610 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007611 else
7612 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007613 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007614 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7615 Py_DECREF(restuple);
7616 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007617 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007618 Py_INCREF(resunicode);
7619 Py_DECREF(restuple);
7620 return resunicode;
7621}
7622
7623/* Lookup the character ch in the mapping and put the result in result,
7624 which must be decrefed by the caller.
7625 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007626static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007627charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007628{
Christian Heimes217cfd12007-12-02 14:31:20 +00007629 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007630 PyObject *x;
7631
7632 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007633 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007634 x = PyObject_GetItem(mapping, w);
7635 Py_DECREF(w);
7636 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007637 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7638 /* No mapping found means: use 1:1 mapping. */
7639 PyErr_Clear();
7640 *result = NULL;
7641 return 0;
7642 } else
7643 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007644 }
7645 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007646 *result = x;
7647 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007648 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007649 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007650 long value = PyLong_AS_LONG(x);
7651 long max = PyUnicode_GetMax();
7652 if (value < 0 || value > max) {
7653 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00007654 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007655 Py_DECREF(x);
7656 return -1;
7657 }
7658 *result = x;
7659 return 0;
7660 }
7661 else if (PyUnicode_Check(x)) {
7662 *result = x;
7663 return 0;
7664 }
7665 else {
7666 /* wrong return value */
7667 PyErr_SetString(PyExc_TypeError,
7668 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007669 Py_DECREF(x);
7670 return -1;
7671 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007672}
7673/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00007674 if not reallocate and adjust various state variables.
7675 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007676static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007677charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00007678 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007679{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007680 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00007681 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007682 /* exponentially overallocate to minimize reallocations */
7683 if (requiredsize < 2 * oldsize)
7684 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007685 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
7686 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007687 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007688 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007689 }
7690 return 0;
7691}
7692/* lookup the character, put the result in the output string and adjust
7693 various state variables. Return a new reference to the object that
7694 was put in the output buffer in *result, or Py_None, if the mapping was
7695 undefined (in which case no character was written).
7696 The called must decref result.
7697 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007698static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007699charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
7700 PyObject *mapping, Py_UCS4 **output,
7701 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007702 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007703{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007704 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
7705 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00007706 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007707 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007708 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007709 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007710 }
7711 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007712 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00007713 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007714 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007715 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007716 }
7717 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007718 Py_ssize_t repsize;
7719 if (PyUnicode_READY(*res) == -1)
7720 return -1;
7721 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00007722 if (repsize==1) {
7723 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007724 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00007725 }
7726 else if (repsize!=0) {
7727 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007728 Py_ssize_t requiredsize = *opos +
7729 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00007730 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007731 Py_ssize_t i;
7732 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007733 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007734 for(i = 0; i < repsize; i++)
7735 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00007736 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007737 }
7738 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007739 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007740 return 0;
7741}
7742
Alexander Belopolsky40018472011-02-26 01:02:56 +00007743PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007744_PyUnicode_TranslateCharmap(PyObject *input,
7745 PyObject *mapping,
7746 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007747{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007748 /* input object */
7749 char *idata;
7750 Py_ssize_t size, i;
7751 int kind;
7752 /* output buffer */
7753 Py_UCS4 *output = NULL;
7754 Py_ssize_t osize;
7755 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007756 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007757 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007758 char *reason = "character maps to <undefined>";
7759 PyObject *errorHandler = NULL;
7760 PyObject *exc = NULL;
7761 /* the following variable is used for caching string comparisons
7762 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7763 * 3=ignore, 4=xmlcharrefreplace */
7764 int known_errorHandler = -1;
7765
Guido van Rossumd57fd912000-03-10 22:53:23 +00007766 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007767 PyErr_BadArgument();
7768 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007769 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007770
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007771 if (PyUnicode_READY(input) == -1)
7772 return NULL;
7773 idata = (char*)PyUnicode_DATA(input);
7774 kind = PyUnicode_KIND(input);
7775 size = PyUnicode_GET_LENGTH(input);
7776 i = 0;
7777
7778 if (size == 0) {
7779 Py_INCREF(input);
7780 return input;
7781 }
7782
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007783 /* allocate enough for a simple 1:1 translation without
7784 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007785 osize = size;
7786 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
7787 opos = 0;
7788 if (output == NULL) {
7789 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00007790 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007791 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007792
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007793 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007794 /* try to encode it */
7795 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007796 if (charmaptranslate_output(input, i, mapping,
7797 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007798 Py_XDECREF(x);
7799 goto onError;
7800 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007801 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00007802 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007803 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007804 else { /* untranslatable character */
7805 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7806 Py_ssize_t repsize;
7807 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007808 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00007809 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007810 Py_ssize_t collstart = i;
7811 Py_ssize_t collend = i+1;
7812 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007813
Benjamin Peterson29060642009-01-31 22:14:21 +00007814 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007815 while (collend < size) {
7816 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007817 goto onError;
7818 Py_XDECREF(x);
7819 if (x!=Py_None)
7820 break;
7821 ++collend;
7822 }
7823 /* cache callback name lookup
7824 * (if not done yet, i.e. it's the first error) */
7825 if (known_errorHandler==-1) {
7826 if ((errors==NULL) || (!strcmp(errors, "strict")))
7827 known_errorHandler = 1;
7828 else if (!strcmp(errors, "replace"))
7829 known_errorHandler = 2;
7830 else if (!strcmp(errors, "ignore"))
7831 known_errorHandler = 3;
7832 else if (!strcmp(errors, "xmlcharrefreplace"))
7833 known_errorHandler = 4;
7834 else
7835 known_errorHandler = 0;
7836 }
7837 switch (known_errorHandler) {
7838 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007839 raise_translate_exception(&exc, input, collstart,
7840 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007841 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007842 case 2: /* replace */
7843 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007844 for (coll = collstart; coll<collend; coll++)
7845 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00007846 /* fall through */
7847 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007848 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007849 break;
7850 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007851 /* generate replacement (temporarily (mis)uses i) */
7852 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007853 char buffer[2+29+1+1];
7854 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007855 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
7856 if (charmaptranslate_makespace(&output, &osize,
7857 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00007858 goto onError;
7859 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007860 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00007861 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007862 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007863 break;
7864 default:
7865 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007866 reason, input, &exc,
7867 collstart, collend, &newpos);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02007868 if (repunicode == NULL || _PyUnicode_READY_REPLACE(&repunicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00007869 goto onError;
7870 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007871 repsize = PyUnicode_GET_LENGTH(repunicode);
7872 if (charmaptranslate_makespace(&output, &osize,
7873 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007874 Py_DECREF(repunicode);
7875 goto onError;
7876 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007877 for (uni2 = 0; repsize-->0; ++uni2)
7878 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
7879 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00007880 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007881 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007882 }
7883 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007884 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
7885 if (!res)
7886 goto onError;
7887 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007888 Py_XDECREF(exc);
7889 Py_XDECREF(errorHandler);
7890 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007891
Benjamin Peterson29060642009-01-31 22:14:21 +00007892 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007893 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007894 Py_XDECREF(exc);
7895 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007896 return NULL;
7897}
7898
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007899/* Deprecated. Use PyUnicode_Translate instead. */
7900PyObject *
7901PyUnicode_TranslateCharmap(const Py_UNICODE *p,
7902 Py_ssize_t size,
7903 PyObject *mapping,
7904 const char *errors)
7905{
7906 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7907 if (!unicode)
7908 return NULL;
7909 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
7910}
7911
Alexander Belopolsky40018472011-02-26 01:02:56 +00007912PyObject *
7913PyUnicode_Translate(PyObject *str,
7914 PyObject *mapping,
7915 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007916{
7917 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007918
Guido van Rossumd57fd912000-03-10 22:53:23 +00007919 str = PyUnicode_FromObject(str);
7920 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007921 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007922 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007923 Py_DECREF(str);
7924 return result;
Tim Petersced69f82003-09-16 20:30:58 +00007925
Benjamin Peterson29060642009-01-31 22:14:21 +00007926 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007927 Py_XDECREF(str);
7928 return NULL;
7929}
Tim Petersced69f82003-09-16 20:30:58 +00007930
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007931static Py_UCS4
7932fix_decimal_and_space_to_ascii(PyUnicodeObject *self)
7933{
7934 /* No need to call PyUnicode_READY(self) because this function is only
7935 called as a callback from fixup() which does it already. */
7936 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
7937 const int kind = PyUnicode_KIND(self);
7938 void *data = PyUnicode_DATA(self);
7939 Py_UCS4 maxchar = 0, ch, fixed;
7940 Py_ssize_t i;
7941
7942 for (i = 0; i < len; ++i) {
7943 ch = PyUnicode_READ(kind, data, i);
7944 fixed = 0;
7945 if (ch > 127) {
7946 if (Py_UNICODE_ISSPACE(ch))
7947 fixed = ' ';
7948 else {
7949 const int decimal = Py_UNICODE_TODECIMAL(ch);
7950 if (decimal >= 0)
7951 fixed = '0' + decimal;
7952 }
7953 if (fixed != 0) {
7954 if (fixed > maxchar)
7955 maxchar = fixed;
7956 PyUnicode_WRITE(kind, data, i, fixed);
7957 }
7958 else if (ch > maxchar)
7959 maxchar = ch;
7960 }
7961 else if (ch > maxchar)
7962 maxchar = ch;
7963 }
7964
7965 return maxchar;
7966}
7967
7968PyObject *
7969_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
7970{
7971 if (!PyUnicode_Check(unicode)) {
7972 PyErr_BadInternalCall();
7973 return NULL;
7974 }
7975 if (PyUnicode_READY(unicode) == -1)
7976 return NULL;
7977 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
7978 /* If the string is already ASCII, just return the same string */
7979 Py_INCREF(unicode);
7980 return unicode;
7981 }
7982 return fixup((PyUnicodeObject *)unicode, fix_decimal_and_space_to_ascii);
7983}
7984
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007985PyObject *
7986PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
7987 Py_ssize_t length)
7988{
7989 PyObject *result;
7990 Py_UNICODE *p; /* write pointer into result */
7991 Py_ssize_t i;
7992 /* Copy to a new string */
7993 result = (PyObject *)_PyUnicode_New(length);
7994 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
7995 if (result == NULL)
7996 return result;
7997 p = PyUnicode_AS_UNICODE(result);
7998 /* Iterate over code points */
7999 for (i = 0; i < length; i++) {
8000 Py_UNICODE ch =s[i];
8001 if (ch > 127) {
8002 int decimal = Py_UNICODE_TODECIMAL(ch);
8003 if (decimal >= 0)
8004 p[i] = '0' + decimal;
8005 }
8006 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008007 if (PyUnicode_READY((PyUnicodeObject*)result) == -1) {
8008 Py_DECREF(result);
8009 return NULL;
8010 }
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008011 return result;
8012}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008013/* --- Decimal Encoder ---------------------------------------------------- */
8014
Alexander Belopolsky40018472011-02-26 01:02:56 +00008015int
8016PyUnicode_EncodeDecimal(Py_UNICODE *s,
8017 Py_ssize_t length,
8018 char *output,
8019 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008020{
8021 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008022 PyObject *errorHandler = NULL;
8023 PyObject *exc = NULL;
8024 const char *encoding = "decimal";
8025 const char *reason = "invalid decimal Unicode string";
8026 /* the following variable is used for caching string comparisons
8027 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
8028 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008029
8030 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008031 PyErr_BadArgument();
8032 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008033 }
8034
8035 p = s;
8036 end = s + length;
8037 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008038 register Py_UNICODE ch = *p;
8039 int decimal;
8040 PyObject *repunicode;
8041 Py_ssize_t repsize;
8042 Py_ssize_t newpos;
8043 Py_UNICODE *uni2;
8044 Py_UNICODE *collstart;
8045 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00008046
Benjamin Peterson29060642009-01-31 22:14:21 +00008047 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008048 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00008049 ++p;
8050 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008051 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008052 decimal = Py_UNICODE_TODECIMAL(ch);
8053 if (decimal >= 0) {
8054 *output++ = '0' + decimal;
8055 ++p;
8056 continue;
8057 }
8058 if (0 < ch && ch < 256) {
8059 *output++ = (char)ch;
8060 ++p;
8061 continue;
8062 }
8063 /* All other characters are considered unencodable */
8064 collstart = p;
8065 collend = p+1;
8066 while (collend < end) {
8067 if ((0 < *collend && *collend < 256) ||
8068 !Py_UNICODE_ISSPACE(*collend) ||
8069 Py_UNICODE_TODECIMAL(*collend))
8070 break;
8071 }
8072 /* cache callback name lookup
8073 * (if not done yet, i.e. it's the first error) */
8074 if (known_errorHandler==-1) {
8075 if ((errors==NULL) || (!strcmp(errors, "strict")))
8076 known_errorHandler = 1;
8077 else if (!strcmp(errors, "replace"))
8078 known_errorHandler = 2;
8079 else if (!strcmp(errors, "ignore"))
8080 known_errorHandler = 3;
8081 else if (!strcmp(errors, "xmlcharrefreplace"))
8082 known_errorHandler = 4;
8083 else
8084 known_errorHandler = 0;
8085 }
8086 switch (known_errorHandler) {
8087 case 1: /* strict */
8088 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
8089 goto onError;
8090 case 2: /* replace */
8091 for (p = collstart; p < collend; ++p)
8092 *output++ = '?';
8093 /* fall through */
8094 case 3: /* ignore */
8095 p = collend;
8096 break;
8097 case 4: /* xmlcharrefreplace */
8098 /* generate replacement (temporarily (mis)uses p) */
8099 for (p = collstart; p < collend; ++p)
8100 output += sprintf(output, "&#%d;", (int)*p);
8101 p = collend;
8102 break;
8103 default:
8104 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
8105 encoding, reason, s, length, &exc,
8106 collstart-s, collend-s, &newpos);
8107 if (repunicode == NULL)
8108 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008109 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00008110 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008111 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8112 Py_DECREF(repunicode);
8113 goto onError;
8114 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008115 /* generate replacement */
8116 repsize = PyUnicode_GET_SIZE(repunicode);
8117 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
8118 Py_UNICODE ch = *uni2;
8119 if (Py_UNICODE_ISSPACE(ch))
8120 *output++ = ' ';
8121 else {
8122 decimal = Py_UNICODE_TODECIMAL(ch);
8123 if (decimal >= 0)
8124 *output++ = '0' + decimal;
8125 else if (0 < ch && ch < 256)
8126 *output++ = (char)ch;
8127 else {
8128 Py_DECREF(repunicode);
8129 raise_encode_exception(&exc, encoding,
8130 s, length, collstart-s, collend-s, reason);
8131 goto onError;
8132 }
8133 }
8134 }
8135 p = s + newpos;
8136 Py_DECREF(repunicode);
8137 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00008138 }
8139 /* 0-terminate the output string */
8140 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008141 Py_XDECREF(exc);
8142 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008143 return 0;
8144
Benjamin Peterson29060642009-01-31 22:14:21 +00008145 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008146 Py_XDECREF(exc);
8147 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008148 return -1;
8149}
8150
Guido van Rossumd57fd912000-03-10 22:53:23 +00008151/* --- Helpers ------------------------------------------------------------ */
8152
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008153#include "stringlib/ucs1lib.h"
8154#include "stringlib/fastsearch.h"
8155#include "stringlib/partition.h"
8156#include "stringlib/split.h"
8157#include "stringlib/count.h"
8158#include "stringlib/find.h"
8159#include "stringlib/localeutil.h"
8160#include "stringlib/undef.h"
8161
8162#include "stringlib/ucs2lib.h"
8163#include "stringlib/fastsearch.h"
8164#include "stringlib/partition.h"
8165#include "stringlib/split.h"
8166#include "stringlib/count.h"
8167#include "stringlib/find.h"
8168#include "stringlib/localeutil.h"
8169#include "stringlib/undef.h"
8170
8171#include "stringlib/ucs4lib.h"
8172#include "stringlib/fastsearch.h"
8173#include "stringlib/partition.h"
8174#include "stringlib/split.h"
8175#include "stringlib/count.h"
8176#include "stringlib/find.h"
8177#include "stringlib/localeutil.h"
8178#include "stringlib/undef.h"
8179
8180static Py_ssize_t
8181any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
8182 const Py_UCS1*, Py_ssize_t,
8183 Py_ssize_t, Py_ssize_t),
8184 Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
8185 const Py_UCS2*, Py_ssize_t,
8186 Py_ssize_t, Py_ssize_t),
8187 Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t,
8188 const Py_UCS4*, Py_ssize_t,
8189 Py_ssize_t, Py_ssize_t),
8190 PyObject* s1, PyObject* s2,
8191 Py_ssize_t start,
8192 Py_ssize_t end)
8193{
8194 int kind1, kind2, kind;
8195 void *buf1, *buf2;
8196 Py_ssize_t len1, len2, result;
8197
8198 kind1 = PyUnicode_KIND(s1);
8199 kind2 = PyUnicode_KIND(s2);
8200 kind = kind1 > kind2 ? kind1 : kind2;
8201 buf1 = PyUnicode_DATA(s1);
8202 buf2 = PyUnicode_DATA(s2);
8203 if (kind1 != kind)
8204 buf1 = _PyUnicode_AsKind(s1, kind);
8205 if (!buf1)
8206 return -2;
8207 if (kind2 != kind)
8208 buf2 = _PyUnicode_AsKind(s2, kind);
8209 if (!buf2) {
8210 if (kind1 != kind) PyMem_Free(buf1);
8211 return -2;
8212 }
8213 len1 = PyUnicode_GET_LENGTH(s1);
8214 len2 = PyUnicode_GET_LENGTH(s2);
8215
8216 switch(kind) {
8217 case PyUnicode_1BYTE_KIND:
8218 result = ucs1(buf1, len1, buf2, len2, start, end);
8219 break;
8220 case PyUnicode_2BYTE_KIND:
8221 result = ucs2(buf1, len1, buf2, len2, start, end);
8222 break;
8223 case PyUnicode_4BYTE_KIND:
8224 result = ucs4(buf1, len1, buf2, len2, start, end);
8225 break;
8226 default:
8227 assert(0); result = -2;
8228 }
8229
8230 if (kind1 != kind)
8231 PyMem_Free(buf1);
8232 if (kind2 != kind)
8233 PyMem_Free(buf2);
8234
8235 return result;
8236}
8237
8238Py_ssize_t
8239_PyUnicode_InsertThousandsGrouping(int kind, void *data,
8240 Py_ssize_t n_buffer,
8241 void *digits, Py_ssize_t n_digits,
8242 Py_ssize_t min_width,
8243 const char *grouping,
8244 const char *thousands_sep)
8245{
8246 switch(kind) {
8247 case PyUnicode_1BYTE_KIND:
8248 return _PyUnicode_ucs1_InsertThousandsGrouping(
8249 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8250 min_width, grouping, thousands_sep);
8251 case PyUnicode_2BYTE_KIND:
8252 return _PyUnicode_ucs2_InsertThousandsGrouping(
8253 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
8254 min_width, grouping, thousands_sep);
8255 case PyUnicode_4BYTE_KIND:
8256 return _PyUnicode_ucs4_InsertThousandsGrouping(
8257 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
8258 min_width, grouping, thousands_sep);
8259 }
8260 assert(0);
8261 return -1;
8262}
8263
8264
Eric Smith8c663262007-08-25 02:26:07 +00008265#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00008266#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008267
Thomas Wouters477c8d52006-05-27 19:21:47 +00008268#include "stringlib/count.h"
8269#include "stringlib/find.h"
Eric Smith5807c412008-05-11 21:00:57 +00008270
/* Helper macro to fix up start/end slice values against a length.

   end:   clipped to len when larger; when negative, len is added and the
          result is clamped at 0.
   start: when negative, len is added and the result is clamped at 0.
          start is NOT clipped against len or end, so start > end is
          possible afterwards.

   start and end must be modifiable lvalues.  Every argument is evaluated
   more than once, so none may have side effects.  Arguments are
   parenthesized so that operator expressions can be passed safely.  The
   expansion is a plain statement sequence (not a single statement), so do
   not use it as the unbraced body of an if/else or a loop. */
#define ADJUST_INDICES(start, end, len)         \
    if ((end) > (len))                          \
        (end) = (len);                          \
    else if ((end) < 0) {                       \
        (end) += (len);                         \
        if ((end) < 0)                          \
            (end) = 0;                          \
    }                                           \
    if ((start) < 0) {                          \
        (start) += (len);                       \
        if ((start) < 0)                        \
            (start) = 0;                        \
    }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008285
Alexander Belopolsky40018472011-02-26 01:02:56 +00008286Py_ssize_t
8287PyUnicode_Count(PyObject *str,
8288 PyObject *substr,
8289 Py_ssize_t start,
8290 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008291{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008292 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008293 PyUnicodeObject* str_obj;
8294 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008295 int kind1, kind2, kind;
8296 void *buf1 = NULL, *buf2 = NULL;
8297 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008298
Thomas Wouters477c8d52006-05-27 19:21:47 +00008299 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008300 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008301 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008302 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02008303 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008304 Py_DECREF(str_obj);
8305 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008306 }
Tim Petersced69f82003-09-16 20:30:58 +00008307
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008308 kind1 = PyUnicode_KIND(str_obj);
8309 kind2 = PyUnicode_KIND(sub_obj);
8310 kind = kind1 > kind2 ? kind1 : kind2;
8311 buf1 = PyUnicode_DATA(str_obj);
8312 if (kind1 != kind)
8313 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
8314 if (!buf1)
8315 goto onError;
8316 buf2 = PyUnicode_DATA(sub_obj);
8317 if (kind2 != kind)
8318 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
8319 if (!buf2)
8320 goto onError;
8321 len1 = PyUnicode_GET_LENGTH(str_obj);
8322 len2 = PyUnicode_GET_LENGTH(sub_obj);
8323
8324 ADJUST_INDICES(start, end, len1);
8325 switch(kind) {
8326 case PyUnicode_1BYTE_KIND:
8327 result = ucs1lib_count(
8328 ((Py_UCS1*)buf1) + start, end - start,
8329 buf2, len2, PY_SSIZE_T_MAX
8330 );
8331 break;
8332 case PyUnicode_2BYTE_KIND:
8333 result = ucs2lib_count(
8334 ((Py_UCS2*)buf1) + start, end - start,
8335 buf2, len2, PY_SSIZE_T_MAX
8336 );
8337 break;
8338 case PyUnicode_4BYTE_KIND:
8339 result = ucs4lib_count(
8340 ((Py_UCS4*)buf1) + start, end - start,
8341 buf2, len2, PY_SSIZE_T_MAX
8342 );
8343 break;
8344 default:
8345 assert(0); result = 0;
8346 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008347
8348 Py_DECREF(sub_obj);
8349 Py_DECREF(str_obj);
8350
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008351 if (kind1 != kind)
8352 PyMem_Free(buf1);
8353 if (kind2 != kind)
8354 PyMem_Free(buf2);
8355
Guido van Rossumd57fd912000-03-10 22:53:23 +00008356 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008357 onError:
8358 Py_DECREF(sub_obj);
8359 Py_DECREF(str_obj);
8360 if (kind1 != kind && buf1)
8361 PyMem_Free(buf1);
8362 if (kind2 != kind && buf2)
8363 PyMem_Free(buf2);
8364 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008365}
8366
Alexander Belopolsky40018472011-02-26 01:02:56 +00008367Py_ssize_t
8368PyUnicode_Find(PyObject *str,
8369 PyObject *sub,
8370 Py_ssize_t start,
8371 Py_ssize_t end,
8372 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008373{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008374 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008375
Guido van Rossumd57fd912000-03-10 22:53:23 +00008376 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008377 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008378 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008379 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008380 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008381 Py_DECREF(str);
8382 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008383 }
Tim Petersced69f82003-09-16 20:30:58 +00008384
Thomas Wouters477c8d52006-05-27 19:21:47 +00008385 if (direction > 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008386 result = any_find_slice(
8387 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
8388 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008389 );
8390 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008391 result = any_find_slice(
8392 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
8393 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008394 );
8395
Guido van Rossumd57fd912000-03-10 22:53:23 +00008396 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008397 Py_DECREF(sub);
8398
Guido van Rossumd57fd912000-03-10 22:53:23 +00008399 return result;
8400}
8401
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008402Py_ssize_t
8403PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8404 Py_ssize_t start, Py_ssize_t end,
8405 int direction)
8406{
8407 char *result;
8408 int kind;
8409 if (PyUnicode_READY(str) == -1)
8410 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02008411 if (start < 0 || end < 0) {
8412 PyErr_SetString(PyExc_IndexError, "string index out of range");
8413 return -2;
8414 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008415 if (end > PyUnicode_GET_LENGTH(str))
8416 end = PyUnicode_GET_LENGTH(str);
8417 kind = PyUnicode_KIND(str);
8418 result = findchar(PyUnicode_1BYTE_DATA(str)
8419 + PyUnicode_KIND_SIZE(kind, start),
8420 kind,
8421 end-start, ch, direction);
8422 if (!result)
8423 return -1;
8424 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8425}
8426
Alexander Belopolsky40018472011-02-26 01:02:56 +00008427static int
8428tailmatch(PyUnicodeObject *self,
8429 PyUnicodeObject *substring,
8430 Py_ssize_t start,
8431 Py_ssize_t end,
8432 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008433{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008434 int kind_self;
8435 int kind_sub;
8436 void *data_self;
8437 void *data_sub;
8438 Py_ssize_t offset;
8439 Py_ssize_t i;
8440 Py_ssize_t end_sub;
8441
8442 if (PyUnicode_READY(self) == -1 ||
8443 PyUnicode_READY(substring) == -1)
8444 return 0;
8445
8446 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008447 return 1;
8448
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008449 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8450 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008451 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008452 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008453
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008454 kind_self = PyUnicode_KIND(self);
8455 data_self = PyUnicode_DATA(self);
8456 kind_sub = PyUnicode_KIND(substring);
8457 data_sub = PyUnicode_DATA(substring);
8458 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8459
8460 if (direction > 0)
8461 offset = end;
8462 else
8463 offset = start;
8464
8465 if (PyUnicode_READ(kind_self, data_self, offset) ==
8466 PyUnicode_READ(kind_sub, data_sub, 0) &&
8467 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8468 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8469 /* If both are of the same kind, memcmp is sufficient */
8470 if (kind_self == kind_sub) {
8471 return ! memcmp((char *)data_self +
8472 (offset * PyUnicode_CHARACTER_SIZE(substring)),
8473 data_sub,
8474 PyUnicode_GET_LENGTH(substring) *
8475 PyUnicode_CHARACTER_SIZE(substring));
8476 }
8477 /* otherwise we have to compare each character by first accesing it */
8478 else {
8479 /* We do not need to compare 0 and len(substring)-1 because
8480 the if statement above ensured already that they are equal
8481 when we end up here. */
8482 // TODO: honor direction and do a forward or backwards search
8483 for (i = 1; i < end_sub; ++i) {
8484 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8485 PyUnicode_READ(kind_sub, data_sub, i))
8486 return 0;
8487 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008488 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008489 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008490 }
8491
8492 return 0;
8493}
8494
Alexander Belopolsky40018472011-02-26 01:02:56 +00008495Py_ssize_t
8496PyUnicode_Tailmatch(PyObject *str,
8497 PyObject *substr,
8498 Py_ssize_t start,
8499 Py_ssize_t end,
8500 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008501{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008502 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008503
Guido van Rossumd57fd912000-03-10 22:53:23 +00008504 str = PyUnicode_FromObject(str);
8505 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008506 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008507 substr = PyUnicode_FromObject(substr);
8508 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008509 Py_DECREF(str);
8510 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008511 }
Tim Petersced69f82003-09-16 20:30:58 +00008512
Guido van Rossumd57fd912000-03-10 22:53:23 +00008513 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00008514 (PyUnicodeObject *)substr,
8515 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008516 Py_DECREF(str);
8517 Py_DECREF(substr);
8518 return result;
8519}
8520
Guido van Rossumd57fd912000-03-10 22:53:23 +00008521/* Apply fixfct filter to the Unicode object self and return a
8522 reference to the modified object */
8523
Alexander Belopolsky40018472011-02-26 01:02:56 +00008524static PyObject *
8525fixup(PyUnicodeObject *self,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008526 Py_UCS4 (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008527{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008528 PyObject *u;
8529 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008530
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008531 if (PyUnicode_READY(self) == -1)
8532 return NULL;
8533 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8534 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8535 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008536 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008537 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008538
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008539 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
8540 PyUnicode_GET_LENGTH(u) * PyUnicode_CHARACTER_SIZE(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008541
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008542 /* fix functions return the new maximum character in a string,
8543 if the kind of the resulting unicode object does not change,
8544 everything is fine. Otherwise we need to change the string kind
8545 and re-run the fix function. */
8546 maxchar_new = fixfct((PyUnicodeObject*)u);
8547 if (maxchar_new == 0)
8548 /* do nothing, keep maxchar_new at 0 which means no changes. */;
8549 else if (maxchar_new <= 127)
8550 maxchar_new = 127;
8551 else if (maxchar_new <= 255)
8552 maxchar_new = 255;
8553 else if (maxchar_new <= 65535)
8554 maxchar_new = 65535;
8555 else
8556 maxchar_new = 1114111; /* 0x10ffff */
8557
8558 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008559 /* fixfct should return TRUE if it modified the buffer. If
8560 FALSE, return a reference to the original buffer instead
8561 (to save space, not time) */
8562 Py_INCREF(self);
8563 Py_DECREF(u);
8564 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008565 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008566 else if (maxchar_new == maxchar_old) {
8567 return u;
8568 }
8569 else {
8570 /* In case the maximum character changed, we need to
8571 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008572 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008573 if (v == NULL) {
8574 Py_DECREF(u);
8575 return NULL;
8576 }
8577 if (maxchar_new > maxchar_old) {
8578 /* If the maxchar increased so that the kind changed, not all
8579 characters are representable anymore and we need to fix the
8580 string again. This only happens in very few cases. */
Victor Stinner157f83f2011-09-28 21:41:31 +02008581 if (PyUnicode_CopyCharacters(v, 0,
8582 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008583 PyUnicode_GET_LENGTH(self)) < 0)
8584 {
8585 Py_DECREF(u);
8586 return NULL;
8587 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008588 maxchar_old = fixfct((PyUnicodeObject*)v);
8589 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8590 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008591 else {
Victor Stinner157f83f2011-09-28 21:41:31 +02008592 if (PyUnicode_CopyCharacters(v, 0,
8593 u, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008594 PyUnicode_GET_LENGTH(self)) < 0)
8595 {
8596 Py_DECREF(u);
8597 return NULL;
8598 }
8599 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008600
8601 Py_DECREF(u);
8602 return v;
8603 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008604}
8605
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008606static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008607fixupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008608{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008609 /* No need to call PyUnicode_READY(self) because this function is only
8610 called as a callback from fixup() which does it already. */
8611 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8612 const int kind = PyUnicode_KIND(self);
8613 void *data = PyUnicode_DATA(self);
8614 int touched = 0;
8615 Py_UCS4 maxchar = 0;
8616 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008617
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008618 for (i = 0; i < len; ++i) {
8619 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8620 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8621 if (up != ch) {
8622 if (up > maxchar)
8623 maxchar = up;
8624 PyUnicode_WRITE(kind, data, i, up);
8625 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008626 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008627 else if (ch > maxchar)
8628 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008629 }
8630
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008631 if (touched)
8632 return maxchar;
8633 else
8634 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008635}
8636
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008637static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008638fixlower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008639{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008640 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8641 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8642 const int kind = PyUnicode_KIND(self);
8643 void *data = PyUnicode_DATA(self);
8644 int touched = 0;
8645 Py_UCS4 maxchar = 0;
8646 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008647
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008648 for(i = 0; i < len; ++i) {
8649 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8650 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8651 if (lo != ch) {
8652 if (lo > maxchar)
8653 maxchar = lo;
8654 PyUnicode_WRITE(kind, data, i, lo);
8655 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008656 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008657 else if (ch > maxchar)
8658 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008659 }
8660
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008661 if (touched)
8662 return maxchar;
8663 else
8664 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008665}
8666
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008667static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008668fixswapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008669{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008670 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8671 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8672 const int kind = PyUnicode_KIND(self);
8673 void *data = PyUnicode_DATA(self);
8674 int touched = 0;
8675 Py_UCS4 maxchar = 0;
8676 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008677
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008678 for(i = 0; i < len; ++i) {
8679 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8680 Py_UCS4 nu = 0;
8681
8682 if (Py_UNICODE_ISUPPER(ch))
8683 nu = Py_UNICODE_TOLOWER(ch);
8684 else if (Py_UNICODE_ISLOWER(ch))
8685 nu = Py_UNICODE_TOUPPER(ch);
8686
8687 if (nu != 0) {
8688 if (nu > maxchar)
8689 maxchar = nu;
8690 PyUnicode_WRITE(kind, data, i, nu);
8691 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008692 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008693 else if (ch > maxchar)
8694 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008695 }
8696
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008697 if (touched)
8698 return maxchar;
8699 else
8700 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008701}
8702
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008703static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008704fixcapitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008705{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008706 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8707 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8708 const int kind = PyUnicode_KIND(self);
8709 void *data = PyUnicode_DATA(self);
8710 int touched = 0;
8711 Py_UCS4 maxchar = 0;
8712 Py_ssize_t i = 0;
8713 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00008714
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008715 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008716 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008717
8718 ch = PyUnicode_READ(kind, data, i);
8719 if (!Py_UNICODE_ISUPPER(ch)) {
8720 maxchar = Py_UNICODE_TOUPPER(ch);
8721 PyUnicode_WRITE(kind, data, i, maxchar);
8722 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008723 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008724 ++i;
8725 for(; i < len; ++i) {
8726 ch = PyUnicode_READ(kind, data, i);
8727 if (!Py_UNICODE_ISLOWER(ch)) {
8728 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8729 if (lo > maxchar)
8730 maxchar = lo;
8731 PyUnicode_WRITE(kind, data, i, lo);
8732 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008733 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008734 else if (ch > maxchar)
8735 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008736 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008737
8738 if (touched)
8739 return maxchar;
8740 else
8741 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008742}
8743
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008744static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008745fixtitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008746{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008747 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8748 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8749 const int kind = PyUnicode_KIND(self);
8750 void *data = PyUnicode_DATA(self);
8751 Py_UCS4 maxchar = 0;
8752 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008753 int previous_is_cased;
8754
8755 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008756 if (len == 1) {
8757 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8758 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
8759 if (ti != ch) {
8760 PyUnicode_WRITE(kind, data, i, ti);
8761 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00008762 }
8763 else
8764 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008765 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008766 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008767 for(; i < len; ++i) {
8768 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8769 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00008770
Benjamin Peterson29060642009-01-31 22:14:21 +00008771 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008772 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00008773 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008774 nu = Py_UNICODE_TOTITLE(ch);
8775
8776 if (nu > maxchar)
8777 maxchar = nu;
8778 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00008779
Benjamin Peterson29060642009-01-31 22:14:21 +00008780 if (Py_UNICODE_ISLOWER(ch) ||
8781 Py_UNICODE_ISUPPER(ch) ||
8782 Py_UNICODE_ISTITLE(ch))
8783 previous_is_cased = 1;
8784 else
8785 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008786 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008787 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008788}
8789
Tim Peters8ce9f162004-08-27 01:49:32 +00008790PyObject *
8791PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008792{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008793 PyObject *sep = NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008794 Py_ssize_t seplen = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008795 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00008796 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008797 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
8798 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00008799 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008800 Py_ssize_t sz, i, res_offset;
8801 Py_UCS4 maxchar = 0;
8802 Py_UCS4 item_maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008803
Tim Peters05eba1f2004-08-27 21:32:02 +00008804 fseq = PySequence_Fast(seq, "");
8805 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008806 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00008807 }
8808
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008809 /* NOTE: the following code can't call back into Python code,
8810 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00008811 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008812
Tim Peters05eba1f2004-08-27 21:32:02 +00008813 seqlen = PySequence_Fast_GET_SIZE(fseq);
8814 /* If empty sequence, return u"". */
8815 if (seqlen == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008816 res = PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008817 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00008818 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008819 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00008820 /* If singleton sequence with an exact Unicode, return that. */
8821 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008822 item = items[0];
8823 if (PyUnicode_CheckExact(item)) {
8824 Py_INCREF(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008825 res = item;
Benjamin Peterson29060642009-01-31 22:14:21 +00008826 goto Done;
8827 }
Tim Peters8ce9f162004-08-27 01:49:32 +00008828 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008829 else {
8830 /* Set up sep and seplen */
8831 if (separator == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008832 /* fall back to a blank space separator */
8833 sep = PyUnicode_FromOrdinal(' ');
Victor Stinnere9a29352011-10-01 02:14:59 +02008834 if (!sep)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008835 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00008836 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008837 else {
8838 if (!PyUnicode_Check(separator)) {
8839 PyErr_Format(PyExc_TypeError,
8840 "separator: expected str instance,"
8841 " %.80s found",
8842 Py_TYPE(separator)->tp_name);
8843 goto onError;
8844 }
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02008845 if (PyUnicode_READY(separator))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008846 goto onError;
8847 sep = separator;
8848 seplen = PyUnicode_GET_LENGTH(separator);
8849 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
8850 /* inc refcount to keep this code path symetric with the
8851 above case of a blank separator */
8852 Py_INCREF(sep);
Tim Peters05eba1f2004-08-27 21:32:02 +00008853 }
8854 }
8855
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008856 /* There are at least two things to join, or else we have a subclass
8857 * of str in the sequence.
8858 * Do a pre-pass to figure out the total amount of space we'll
8859 * need (sz), and see whether all argument are strings.
8860 */
8861 sz = 0;
8862 for (i = 0; i < seqlen; i++) {
8863 const Py_ssize_t old_sz = sz;
8864 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00008865 if (!PyUnicode_Check(item)) {
8866 PyErr_Format(PyExc_TypeError,
8867 "sequence item %zd: expected str instance,"
8868 " %.80s found",
8869 i, Py_TYPE(item)->tp_name);
8870 goto onError;
8871 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008872 if (PyUnicode_READY(item) == -1)
8873 goto onError;
8874 sz += PyUnicode_GET_LENGTH(item);
8875 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
8876 if (item_maxchar > maxchar)
8877 maxchar = item_maxchar;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008878 if (i != 0)
8879 sz += seplen;
8880 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
8881 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008882 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008883 goto onError;
8884 }
8885 }
Tim Petersced69f82003-09-16 20:30:58 +00008886
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008887 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008888 if (res == NULL)
8889 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00008890
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008891 /* Catenate everything. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008892 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinner9ce5a832011-10-03 23:36:02 +02008893 Py_ssize_t itemlen, copied;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008894 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00008895 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02008896 if (i && seplen != 0) {
8897 copied = PyUnicode_CopyCharacters(res, res_offset,
8898 sep, 0, seplen);
8899 if (copied < 0)
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008900 goto onError;
Victor Stinner9ce5a832011-10-03 23:36:02 +02008901#ifdef Py_DEBUG
8902 res_offset += copied;
8903#else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008904 res_offset += seplen;
Victor Stinner9ce5a832011-10-03 23:36:02 +02008905#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00008906 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02008907 itemlen = PyUnicode_GET_LENGTH(item);
8908 if (itemlen != 0) {
8909 copied = PyUnicode_CopyCharacters(res, res_offset,
8910 item, 0, itemlen);
8911 if (copied < 0)
8912 goto onError;
8913#ifdef Py_DEBUG
8914 res_offset += copied;
8915#else
8916 res_offset += itemlen;
8917#endif
8918 }
Tim Peters05eba1f2004-08-27 21:32:02 +00008919 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008920 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00008921
Benjamin Peterson29060642009-01-31 22:14:21 +00008922 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00008923 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008924 Py_XDECREF(sep);
8925 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008926
Benjamin Peterson29060642009-01-31 22:14:21 +00008927 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00008928 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008929 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00008930 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008931 return NULL;
8932}
8933
/* Store `length` copies of the code point `value` into the character
   buffer `data`, beginning at character index `start`.

   `kind` selects the element width: PyUnicode_1BYTE_KIND writes bytes
   with memset(), PyUnicode_2BYTE_KIND writes Py_UCS2 units, and every
   other kind writes Py_UCS4 units.  PyUnicode_WCHAR_KIND is rejected by
   an assertion.

   Every argument is parenthesized in the expansion so that compound
   expressions can be passed safely.  Arguments may still be evaluated
   more than once, so they must be free of side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        } \
    } while (0)
8956
Alexander Belopolsky40018472011-02-26 01:02:56 +00008957static PyUnicodeObject *
8958pad(PyUnicodeObject *self,
8959 Py_ssize_t left,
8960 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008961 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008962{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008963 PyObject *u;
8964 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008965 int kind;
8966 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008967
8968 if (left < 0)
8969 left = 0;
8970 if (right < 0)
8971 right = 0;
8972
Tim Peters7a29bd52001-09-12 03:03:31 +00008973 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008974 Py_INCREF(self);
8975 return self;
8976 }
8977
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008978 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
8979 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00008980 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
8981 return NULL;
8982 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008983 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
8984 if (fill > maxchar)
8985 maxchar = fill;
8986 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008987 if (!u)
8988 return NULL;
8989
8990 kind = PyUnicode_KIND(u);
8991 data = PyUnicode_DATA(u);
8992 if (left)
8993 FILL(kind, data, fill, 0, left);
8994 if (right)
8995 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinner157f83f2011-09-28 21:41:31 +02008996 if (PyUnicode_CopyCharacters(u, left,
8997 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008998 _PyUnicode_LENGTH(self)) < 0)
8999 {
9000 Py_DECREF(u);
9001 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009002 }
9003
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009004 return (PyUnicodeObject*)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009005}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009006#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009007
Alexander Belopolsky40018472011-02-26 01:02:56 +00009008PyObject *
9009PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009010{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009011 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009012
9013 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009014 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009015 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009016
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009017 switch(PyUnicode_KIND(string)) {
9018 case PyUnicode_1BYTE_KIND:
9019 list = ucs1lib_splitlines(
9020 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
9021 PyUnicode_GET_LENGTH(string), keepends);
9022 break;
9023 case PyUnicode_2BYTE_KIND:
9024 list = ucs2lib_splitlines(
9025 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
9026 PyUnicode_GET_LENGTH(string), keepends);
9027 break;
9028 case PyUnicode_4BYTE_KIND:
9029 list = ucs4lib_splitlines(
9030 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
9031 PyUnicode_GET_LENGTH(string), keepends);
9032 break;
9033 default:
9034 assert(0);
9035 list = 0;
9036 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009037 Py_DECREF(string);
9038 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009039}
9040
Alexander Belopolsky40018472011-02-26 01:02:56 +00009041static PyObject *
9042split(PyUnicodeObject *self,
9043 PyUnicodeObject *substring,
9044 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009045{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009046 int kind1, kind2, kind;
9047 void *buf1, *buf2;
9048 Py_ssize_t len1, len2;
9049 PyObject* out;
9050
Guido van Rossumd57fd912000-03-10 22:53:23 +00009051 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009052 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009053
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009054 if (PyUnicode_READY(self) == -1)
9055 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009056
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009057 if (substring == NULL)
9058 switch(PyUnicode_KIND(self)) {
9059 case PyUnicode_1BYTE_KIND:
9060 return ucs1lib_split_whitespace(
9061 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9062 PyUnicode_GET_LENGTH(self), maxcount
9063 );
9064 case PyUnicode_2BYTE_KIND:
9065 return ucs2lib_split_whitespace(
9066 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9067 PyUnicode_GET_LENGTH(self), maxcount
9068 );
9069 case PyUnicode_4BYTE_KIND:
9070 return ucs4lib_split_whitespace(
9071 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9072 PyUnicode_GET_LENGTH(self), maxcount
9073 );
9074 default:
9075 assert(0);
9076 return NULL;
9077 }
9078
9079 if (PyUnicode_READY(substring) == -1)
9080 return NULL;
9081
9082 kind1 = PyUnicode_KIND(self);
9083 kind2 = PyUnicode_KIND(substring);
9084 kind = kind1 > kind2 ? kind1 : kind2;
9085 buf1 = PyUnicode_DATA(self);
9086 buf2 = PyUnicode_DATA(substring);
9087 if (kind1 != kind)
9088 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9089 if (!buf1)
9090 return NULL;
9091 if (kind2 != kind)
9092 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9093 if (!buf2) {
9094 if (kind1 != kind) PyMem_Free(buf1);
9095 return NULL;
9096 }
9097 len1 = PyUnicode_GET_LENGTH(self);
9098 len2 = PyUnicode_GET_LENGTH(substring);
9099
9100 switch(kind) {
9101 case PyUnicode_1BYTE_KIND:
9102 out = ucs1lib_split(
9103 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9104 break;
9105 case PyUnicode_2BYTE_KIND:
9106 out = ucs2lib_split(
9107 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9108 break;
9109 case PyUnicode_4BYTE_KIND:
9110 out = ucs4lib_split(
9111 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9112 break;
9113 default:
9114 out = NULL;
9115 }
9116 if (kind1 != kind)
9117 PyMem_Free(buf1);
9118 if (kind2 != kind)
9119 PyMem_Free(buf2);
9120 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009121}
9122
Alexander Belopolsky40018472011-02-26 01:02:56 +00009123static PyObject *
9124rsplit(PyUnicodeObject *self,
9125 PyUnicodeObject *substring,
9126 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009127{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009128 int kind1, kind2, kind;
9129 void *buf1, *buf2;
9130 Py_ssize_t len1, len2;
9131 PyObject* out;
9132
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009133 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009134 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009135
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009136 if (PyUnicode_READY(self) == -1)
9137 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009138
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009139 if (substring == NULL)
9140 switch(PyUnicode_KIND(self)) {
9141 case PyUnicode_1BYTE_KIND:
9142 return ucs1lib_rsplit_whitespace(
9143 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9144 PyUnicode_GET_LENGTH(self), maxcount
9145 );
9146 case PyUnicode_2BYTE_KIND:
9147 return ucs2lib_rsplit_whitespace(
9148 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9149 PyUnicode_GET_LENGTH(self), maxcount
9150 );
9151 case PyUnicode_4BYTE_KIND:
9152 return ucs4lib_rsplit_whitespace(
9153 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9154 PyUnicode_GET_LENGTH(self), maxcount
9155 );
9156 default:
9157 assert(0);
9158 return NULL;
9159 }
9160
9161 if (PyUnicode_READY(substring) == -1)
9162 return NULL;
9163
9164 kind1 = PyUnicode_KIND(self);
9165 kind2 = PyUnicode_KIND(substring);
9166 kind = kind1 > kind2 ? kind1 : kind2;
9167 buf1 = PyUnicode_DATA(self);
9168 buf2 = PyUnicode_DATA(substring);
9169 if (kind1 != kind)
9170 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9171 if (!buf1)
9172 return NULL;
9173 if (kind2 != kind)
9174 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9175 if (!buf2) {
9176 if (kind1 != kind) PyMem_Free(buf1);
9177 return NULL;
9178 }
9179 len1 = PyUnicode_GET_LENGTH(self);
9180 len2 = PyUnicode_GET_LENGTH(substring);
9181
9182 switch(kind) {
9183 case PyUnicode_1BYTE_KIND:
9184 out = ucs1lib_rsplit(
9185 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9186 break;
9187 case PyUnicode_2BYTE_KIND:
9188 out = ucs2lib_rsplit(
9189 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9190 break;
9191 case PyUnicode_4BYTE_KIND:
9192 out = ucs4lib_rsplit(
9193 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9194 break;
9195 default:
9196 out = NULL;
9197 }
9198 if (kind1 != kind)
9199 PyMem_Free(buf1);
9200 if (kind2 != kind)
9201 PyMem_Free(buf2);
9202 return out;
9203}
9204
9205static Py_ssize_t
9206anylib_find(int kind, void *buf1, Py_ssize_t len1,
9207 void *buf2, Py_ssize_t len2, Py_ssize_t offset)
9208{
9209 switch(kind) {
9210 case PyUnicode_1BYTE_KIND:
9211 return ucs1lib_find(buf1, len1, buf2, len2, offset);
9212 case PyUnicode_2BYTE_KIND:
9213 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9214 case PyUnicode_4BYTE_KIND:
9215 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9216 }
9217 assert(0);
9218 return -1;
9219}
9220
9221static Py_ssize_t
9222anylib_count(int kind, void* sbuf, Py_ssize_t slen,
9223 void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
9224{
9225 switch(kind) {
9226 case PyUnicode_1BYTE_KIND:
9227 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
9228 case PyUnicode_2BYTE_KIND:
9229 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9230 case PyUnicode_4BYTE_KIND:
9231 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9232 }
9233 assert(0);
9234 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009235}
9236
Alexander Belopolsky40018472011-02-26 01:02:56 +00009237static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009238replace(PyObject *self, PyObject *str1,
9239 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009240{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009241 PyObject *u;
9242 char *sbuf = PyUnicode_DATA(self);
9243 char *buf1 = PyUnicode_DATA(str1);
9244 char *buf2 = PyUnicode_DATA(str2);
9245 int srelease = 0, release1 = 0, release2 = 0;
9246 int skind = PyUnicode_KIND(self);
9247 int kind1 = PyUnicode_KIND(str1);
9248 int kind2 = PyUnicode_KIND(str2);
9249 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9250 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9251 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009252
9253 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009254 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009255 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009256 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009257
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009258 if (skind < kind1)
9259 /* substring too wide to be present */
9260 goto nothing;
9261
9262 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00009263 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009264 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009265 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009266 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009267 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009268 /* replace characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009269 Py_UCS4 u1, u2, maxchar;
9270 int mayshrink, rkind;
9271 u1 = PyUnicode_READ_CHAR(str1, 0);
9272 if (!findchar(sbuf, PyUnicode_KIND(self),
9273 slen, u1, 1))
Thomas Wouters477c8d52006-05-27 19:21:47 +00009274 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009275 u2 = PyUnicode_READ_CHAR(str2, 0);
9276 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9277 /* Replacing u1 with u2 may cause a maxchar reduction in the
9278 result string. */
9279 mayshrink = maxchar > 127;
9280 if (u2 > maxchar) {
9281 maxchar = u2;
9282 mayshrink = 0;
9283 }
9284 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009285 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009286 goto error;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009287 if (PyUnicode_CopyCharacters(u, 0,
9288 (PyObject*)self, 0, slen) < 0)
9289 {
9290 Py_DECREF(u);
9291 return NULL;
9292 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009293 rkind = PyUnicode_KIND(u);
9294 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
9295 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009296 if (--maxcount < 0)
9297 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009298 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009299 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009300 if (mayshrink) {
9301 PyObject *tmp = u;
9302 u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp),
9303 PyUnicode_GET_LENGTH(tmp));
9304 Py_DECREF(tmp);
9305 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009306 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009307 int rkind = skind;
9308 char *res;
9309 if (kind1 < rkind) {
9310 /* widen substring */
9311 buf1 = _PyUnicode_AsKind(str1, rkind);
9312 if (!buf1) goto error;
9313 release1 = 1;
9314 }
9315 i = anylib_find(rkind, sbuf, slen, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009316 if (i < 0)
9317 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009318 if (rkind > kind2) {
9319 /* widen replacement */
9320 buf2 = _PyUnicode_AsKind(str2, rkind);
9321 if (!buf2) goto error;
9322 release2 = 1;
9323 }
9324 else if (rkind < kind2) {
9325 /* widen self and buf1 */
9326 rkind = kind2;
9327 if (release1) PyMem_Free(buf1);
9328 sbuf = _PyUnicode_AsKind(self, rkind);
9329 if (!sbuf) goto error;
9330 srelease = 1;
9331 buf1 = _PyUnicode_AsKind(str1, rkind);
9332 if (!buf1) goto error;
9333 release1 = 1;
9334 }
9335 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen));
9336 if (!res) {
9337 PyErr_NoMemory();
9338 goto error;
9339 }
9340 memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen));
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009341 /* change everything in-place, starting with this one */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009342 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9343 buf2,
9344 PyUnicode_KIND_SIZE(rkind, len2));
9345 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009346
9347 while ( --maxcount > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009348 i = anylib_find(rkind, sbuf+PyUnicode_KIND_SIZE(rkind, i),
9349 slen-i,
9350 buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009351 if (i == -1)
9352 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009353 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9354 buf2,
9355 PyUnicode_KIND_SIZE(rkind, len2));
9356 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009357 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009358
9359 u = PyUnicode_FromKindAndData(rkind, res, slen);
9360 PyMem_Free(res);
9361 if (!u) goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009362 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009363 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009364
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009365 Py_ssize_t n, i, j, ires;
9366 Py_ssize_t product, new_size;
9367 int rkind = skind;
9368 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009369
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009370 if (kind1 < rkind) {
9371 buf1 = _PyUnicode_AsKind(str1, rkind);
9372 if (!buf1) goto error;
9373 release1 = 1;
9374 }
9375 n = anylib_count(rkind, sbuf, slen, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009376 if (n == 0)
9377 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009378 if (kind2 < rkind) {
9379 buf2 = _PyUnicode_AsKind(str2, rkind);
9380 if (!buf2) goto error;
9381 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009382 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009383 else if (kind2 > rkind) {
9384 rkind = kind2;
9385 sbuf = _PyUnicode_AsKind(self, rkind);
9386 if (!sbuf) goto error;
9387 srelease = 1;
9388 if (release1) PyMem_Free(buf1);
9389 buf1 = _PyUnicode_AsKind(str1, rkind);
9390 if (!buf1) goto error;
9391 release1 = 1;
9392 }
9393 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
9394 PyUnicode_GET_LENGTH(str1))); */
9395 product = n * (len2-len1);
9396 if ((product / (len2-len1)) != n) {
9397 PyErr_SetString(PyExc_OverflowError,
9398 "replace string is too long");
9399 goto error;
9400 }
9401 new_size = slen + product;
9402 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
9403 PyErr_SetString(PyExc_OverflowError,
9404 "replace string is too long");
9405 goto error;
9406 }
9407 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size));
9408 if (!res)
9409 goto error;
9410 ires = i = 0;
9411 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009412 while (n-- > 0) {
9413 /* look for next match */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009414 j = anylib_find(rkind,
9415 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9416 slen-i, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009417 if (j == -1)
9418 break;
9419 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009420 /* copy unchanged part [i:j] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009421 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9422 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9423 PyUnicode_KIND_SIZE(rkind, j-i));
9424 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009425 }
9426 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009427 if (len2 > 0) {
9428 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9429 buf2,
9430 PyUnicode_KIND_SIZE(rkind, len2));
9431 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009432 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009433 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009434 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009435 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009436 /* copy tail [i:] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009437 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9438 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9439 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009440 } else {
9441 /* interleave */
9442 while (n > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009443 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9444 buf2,
9445 PyUnicode_KIND_SIZE(rkind, len2));
9446 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009447 if (--n <= 0)
9448 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009449 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9450 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9451 PyUnicode_KIND_SIZE(rkind, 1));
9452 ires++;
9453 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009454 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009455 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9456 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9457 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009458 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009459 u = PyUnicode_FromKindAndData(rkind, res, new_size);
Martin v. Löwis0b1d3482011-10-01 16:35:40 +02009460 PyMem_Free(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009461 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009462 if (srelease)
9463 PyMem_FREE(sbuf);
9464 if (release1)
9465 PyMem_FREE(buf1);
9466 if (release2)
9467 PyMem_FREE(buf2);
9468 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009469
Benjamin Peterson29060642009-01-31 22:14:21 +00009470 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00009471 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009472 if (srelease)
9473 PyMem_FREE(sbuf);
9474 if (release1)
9475 PyMem_FREE(buf1);
9476 if (release2)
9477 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009478 if (PyUnicode_CheckExact(self)) {
9479 Py_INCREF(self);
9480 return (PyObject *) self;
9481 }
Victor Stinner034f6cf2011-09-30 02:26:44 +02009482 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009483 error:
9484 if (srelease && sbuf)
9485 PyMem_FREE(sbuf);
9486 if (release1 && buf1)
9487 PyMem_FREE(buf1);
9488 if (release2 && buf2)
9489 PyMem_FREE(buf2);
9490 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009491}
9492
9493/* --- Unicode Object Methods --------------------------------------------- */
9494
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009495PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009496 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009497\n\
9498Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009499characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009500
9501static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009502unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009503{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009504 return fixup(self, fixtitle);
9505}
9506
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009507PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009508 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009509\n\
9510Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00009511have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009512
9513static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009514unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009515{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009516 return fixup(self, fixcapitalize);
9517}
9518
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): split `self` on whitespace,
   capitalize each word in place in the list, then join the words with
   PyUnicode_Join(NULL, ...).  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t pos;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word */
    for (pos = 0; pos < PyList_GET_SIZE(words); pos++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, pos),
                            fixcapitalize);
        if (capitalized == NULL)
            goto done;      /* result is still NULL */
        /* PyList_SET_ITEM does not release the old item, so do it here. */
        Py_DECREF(PyList_GET_ITEM(words, pos));
        PyList_SET_ITEM(words, pos, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

  done:
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
9556
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009557/* Argument converter. Coerces to a single unicode character */
9558
9559static int
9560convert_uc(PyObject *obj, void *addr)
9561{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009562 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009563 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009564
Benjamin Peterson14339b62009-01-31 16:36:08 +00009565 uniobj = PyUnicode_FromObject(obj);
9566 if (uniobj == NULL) {
9567 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009568 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009569 return 0;
9570 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009571 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009572 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009573 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009574 Py_DECREF(uniobj);
9575 return 0;
9576 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009577 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009578 Py_DECREF(uniobj);
9579 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009580}
9581
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009582PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009583 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009584\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009585Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009586done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009587
9588static PyObject *
9589unicode_center(PyUnicodeObject *self, PyObject *args)
9590{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009591 Py_ssize_t marg, left;
9592 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009593 Py_UCS4 fillchar = ' ';
9594
Victor Stinnere9a29352011-10-01 02:14:59 +02009595 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009596 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009597
Victor Stinnere9a29352011-10-01 02:14:59 +02009598 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009599 return NULL;
9600
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009601 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009602 Py_INCREF(self);
9603 return (PyObject*) self;
9604 }
9605
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009606 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009607 left = marg / 2 + (marg & width & 1);
9608
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009609 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009610}
9611
Marc-André Lemburge5034372000-08-08 08:04:29 +00009612#if 0
9613
9614/* This code should go into some future Unicode collation support
9615 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00009616 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00009617
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009618/* speedy UTF-16 code point order comparison */
9619/* gleaned from: */
9620/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
9621
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009622static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009623{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009624 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00009625 0, 0, 0, 0, 0, 0, 0, 0,
9626 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009627 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009628};
9629
Guido van Rossumd57fd912000-03-10 22:53:23 +00009630static int
9631unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9632{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009633 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009634
Guido van Rossumd57fd912000-03-10 22:53:23 +00009635 Py_UNICODE *s1 = str1->str;
9636 Py_UNICODE *s2 = str2->str;
9637
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009638 len1 = str1->_base._base.length;
9639 len2 = str2->_base._base.length;
Tim Petersced69f82003-09-16 20:30:58 +00009640
Guido van Rossumd57fd912000-03-10 22:53:23 +00009641 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00009642 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009643
9644 c1 = *s1++;
9645 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00009646
Benjamin Peterson29060642009-01-31 22:14:21 +00009647 if (c1 > (1<<11) * 26)
9648 c1 += utf16Fixup[c1>>11];
9649 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009650 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009651 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00009652
9653 if (c1 != c2)
9654 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00009655
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009656 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009657 }
9658
9659 return (len1 < len2) ? -1 : (len1 != len2);
9660}
9661
Marc-André Lemburge5034372000-08-08 08:04:29 +00009662#else
9663
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009664/* This function assumes that str1 and str2 are readied by the caller. */
9665
Marc-André Lemburge5034372000-08-08 08:04:29 +00009666static int
9667unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9668{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009669 int kind1, kind2;
9670 void *data1, *data2;
9671 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009672
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009673 kind1 = PyUnicode_KIND(str1);
9674 kind2 = PyUnicode_KIND(str2);
9675 data1 = PyUnicode_DATA(str1);
9676 data2 = PyUnicode_DATA(str2);
9677 len1 = PyUnicode_GET_LENGTH(str1);
9678 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +00009679
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009680 for (i = 0; i < len1 && i < len2; ++i) {
9681 Py_UCS4 c1, c2;
9682 c1 = PyUnicode_READ(kind1, data1, i);
9683 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +00009684
9685 if (c1 != c2)
9686 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009687 }
9688
9689 return (len1 < len2) ? -1 : (len1 != len2);
9690}
9691
9692#endif
9693
Alexander Belopolsky40018472011-02-26 01:02:56 +00009694int
9695PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009696{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009697 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9698 if (PyUnicode_READY(left) == -1 ||
9699 PyUnicode_READY(right) == -1)
9700 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009701 return unicode_compare((PyUnicodeObject *)left,
9702 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009703 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009704 PyErr_Format(PyExc_TypeError,
9705 "Can't compare %.100s and %.100s",
9706 left->ob_type->tp_name,
9707 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009708 return -1;
9709}
9710
Martin v. Löwis5b222132007-06-10 09:51:05 +00009711int
9712PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
9713{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009714 Py_ssize_t i;
9715 int kind;
9716 void *data;
9717 Py_UCS4 chr;
9718
Victor Stinner910337b2011-10-03 03:20:16 +02009719 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009720 if (PyUnicode_READY(uni) == -1)
9721 return -1;
9722 kind = PyUnicode_KIND(uni);
9723 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009724 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009725 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
9726 if (chr != str[i])
9727 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00009728 /* This check keeps Python strings that end in '\0' from comparing equal
9729 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009730 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +00009731 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009732 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00009733 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009734 return 0;
9735}
9736
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009737
Benjamin Peterson29060642009-01-31 22:14:21 +00009738#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00009739 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009740
Alexander Belopolsky40018472011-02-26 01:02:56 +00009741PyObject *
9742PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009743{
9744 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009745
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009746 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9747 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009748 if (PyUnicode_READY(left) == -1 ||
9749 PyUnicode_READY(right) == -1)
9750 return NULL;
9751 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
9752 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009753 if (op == Py_EQ) {
9754 Py_INCREF(Py_False);
9755 return Py_False;
9756 }
9757 if (op == Py_NE) {
9758 Py_INCREF(Py_True);
9759 return Py_True;
9760 }
9761 }
9762 if (left == right)
9763 result = 0;
9764 else
9765 result = unicode_compare((PyUnicodeObject *)left,
9766 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009767
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009768 /* Convert the return value to a Boolean */
9769 switch (op) {
9770 case Py_EQ:
9771 v = TEST_COND(result == 0);
9772 break;
9773 case Py_NE:
9774 v = TEST_COND(result != 0);
9775 break;
9776 case Py_LE:
9777 v = TEST_COND(result <= 0);
9778 break;
9779 case Py_GE:
9780 v = TEST_COND(result >= 0);
9781 break;
9782 case Py_LT:
9783 v = TEST_COND(result == -1);
9784 break;
9785 case Py_GT:
9786 v = TEST_COND(result == 1);
9787 break;
9788 default:
9789 PyErr_BadArgument();
9790 return NULL;
9791 }
9792 Py_INCREF(v);
9793 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009794 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009795
Brian Curtindfc80e32011-08-10 20:28:54 -05009796 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009797}
9798
Alexander Belopolsky40018472011-02-26 01:02:56 +00009799int
9800PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00009801{
Thomas Wouters477c8d52006-05-27 19:21:47 +00009802 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009803 int kind1, kind2, kind;
9804 void *buf1, *buf2;
9805 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009806 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009807
9808 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00009809 sub = PyUnicode_FromObject(element);
9810 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009811 PyErr_Format(PyExc_TypeError,
9812 "'in <string>' requires string as left operand, not %s",
9813 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009814 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009815 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009816 if (PyUnicode_READY(sub) == -1)
9817 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009818
Thomas Wouters477c8d52006-05-27 19:21:47 +00009819 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +02009820 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009821 Py_DECREF(sub);
9822 return -1;
9823 }
9824
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009825 kind1 = PyUnicode_KIND(str);
9826 kind2 = PyUnicode_KIND(sub);
9827 kind = kind1 > kind2 ? kind1 : kind2;
9828 buf1 = PyUnicode_DATA(str);
9829 buf2 = PyUnicode_DATA(sub);
9830 if (kind1 != kind)
9831 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
9832 if (!buf1) {
9833 Py_DECREF(sub);
9834 return -1;
9835 }
9836 if (kind2 != kind)
9837 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
9838 if (!buf2) {
9839 Py_DECREF(sub);
9840 if (kind1 != kind) PyMem_Free(buf1);
9841 return -1;
9842 }
9843 len1 = PyUnicode_GET_LENGTH(str);
9844 len2 = PyUnicode_GET_LENGTH(sub);
9845
9846 switch(kind) {
9847 case PyUnicode_1BYTE_KIND:
9848 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
9849 break;
9850 case PyUnicode_2BYTE_KIND:
9851 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
9852 break;
9853 case PyUnicode_4BYTE_KIND:
9854 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
9855 break;
9856 default:
9857 result = -1;
9858 assert(0);
9859 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009860
9861 Py_DECREF(str);
9862 Py_DECREF(sub);
9863
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009864 if (kind1 != kind)
9865 PyMem_Free(buf1);
9866 if (kind2 != kind)
9867 PyMem_Free(buf2);
9868
Guido van Rossum403d68b2000-03-13 15:55:09 +00009869 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009870}
9871
Guido van Rossumd57fd912000-03-10 22:53:23 +00009872/* Concat to string or Unicode object giving a new Unicode object. */
9873
Alexander Belopolsky40018472011-02-26 01:02:56 +00009874PyObject *
9875PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009876{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009877 PyObject *u = NULL, *v = NULL, *w;
9878 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009879
9880 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009881 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009882 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009883 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009884 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009885 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009886 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009887
9888 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +02009889 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009890 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009891 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009892 }
Victor Stinnera464fc12011-10-02 20:39:30 +02009893 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009894 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009895 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009896 }
9897
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009898 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinnerff9e50f2011-09-28 22:17:19 +02009899 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009900
Guido van Rossumd57fd912000-03-10 22:53:23 +00009901 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009902 w = PyUnicode_New(
9903 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
9904 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009905 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009906 goto onError;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009907 if (PyUnicode_CopyCharacters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)) < 0)
9908 goto onError;
Victor Stinner157f83f2011-09-28 21:41:31 +02009909 if (PyUnicode_CopyCharacters(w, PyUnicode_GET_LENGTH(u),
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009910 v, 0,
9911 PyUnicode_GET_LENGTH(v)) < 0)
9912 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009913 Py_DECREF(u);
9914 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009915 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009916
Benjamin Peterson29060642009-01-31 22:14:21 +00009917 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009918 Py_XDECREF(u);
9919 Py_XDECREF(v);
9920 return NULL;
9921}
9922
Walter Dörwald1ab83302007-05-18 17:15:44 +00009923void
Victor Stinner23e56682011-10-03 03:54:37 +02009924PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +00009925{
Victor Stinner23e56682011-10-03 03:54:37 +02009926 PyObject *left, *res;
9927
9928 if (p_left == NULL) {
9929 if (!PyErr_Occurred())
9930 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +00009931 return;
9932 }
Victor Stinner23e56682011-10-03 03:54:37 +02009933 left = *p_left;
9934 if (right == NULL || !PyUnicode_Check(left)) {
9935 if (!PyErr_Occurred())
9936 PyErr_BadInternalCall();
9937 goto error;
9938 }
9939
9940 if (PyUnicode_CheckExact(left) && left != unicode_empty
9941 && PyUnicode_CheckExact(right) && right != unicode_empty
9942 && unicode_resizable(left)
9943 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
9944 || _PyUnicode_WSTR(left) != NULL))
9945 {
Victor Stinnerb8038952011-10-03 23:27:56 +02009946 Py_ssize_t left_len, right_len, new_len;
9947#ifdef Py_DEBUG
9948 Py_ssize_t copied;
9949#endif
Victor Stinner23e56682011-10-03 03:54:37 +02009950
Victor Stinner23e56682011-10-03 03:54:37 +02009951 if (PyUnicode_READY(left))
9952 goto error;
9953 if (PyUnicode_READY(right))
9954 goto error;
9955
9956 /* FIXME: support ascii+latin1, PyASCIIObject => PyCompactUnicodeObject */
9957 if (PyUnicode_MAX_CHAR_VALUE(right) <= PyUnicode_MAX_CHAR_VALUE(left))
9958 {
Victor Stinnerb8038952011-10-03 23:27:56 +02009959 left_len = PyUnicode_GET_LENGTH(left);
9960 right_len = PyUnicode_GET_LENGTH(right);
9961 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner23e56682011-10-03 03:54:37 +02009962 PyErr_SetString(PyExc_OverflowError,
9963 "strings are too large to concat");
9964 goto error;
9965 }
Victor Stinnerb8038952011-10-03 23:27:56 +02009966 new_len = left_len + right_len;
Victor Stinner23e56682011-10-03 03:54:37 +02009967
9968 /* Now we own the last reference to 'left', so we can resize it
9969 * in-place.
9970 */
9971 if (unicode_resize(&left, new_len) != 0) {
9972 /* XXX if _PyUnicode_Resize() fails, 'left' has been
9973 * deallocated so it cannot be put back into
9974 * 'variable'. The MemoryError is raised when there
9975 * is no value in 'variable', which might (very
9976 * remotely) be a cause of incompatibilities.
9977 */
9978 goto error;
9979 }
9980 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerb8038952011-10-03 23:27:56 +02009981#ifdef Py_DEBUG
9982 copied = PyUnicode_CopyCharacters(left, left_len,
Victor Stinner23e56682011-10-03 03:54:37 +02009983 right, 0,
Victor Stinnerb8038952011-10-03 23:27:56 +02009984 right_len);
Victor Stinner23e56682011-10-03 03:54:37 +02009985 assert(0 <= copied);
Victor Stinnerb8038952011-10-03 23:27:56 +02009986#else
9987 PyUnicode_CopyCharacters(left, left_len, right, 0, right_len);
9988#endif
Victor Stinner23e56682011-10-03 03:54:37 +02009989 *p_left = left;
9990 return;
9991 }
9992 }
9993
9994 res = PyUnicode_Concat(left, right);
9995 if (res == NULL)
9996 goto error;
9997 Py_DECREF(left);
9998 *p_left = res;
9999 return;
10000
10001error:
10002 Py_DECREF(*p_left);
10003 *p_left = NULL;
Walter Dörwald1ab83302007-05-18 17:15:44 +000010004}
10005
10006void
10007PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10008{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010009 PyUnicode_Append(pleft, right);
10010 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010011}
10012
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010013PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010014 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010015\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010016Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010017string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010018interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010019
10020static PyObject *
10021unicode_count(PyUnicodeObject *self, PyObject *args)
10022{
10023 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010024 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010025 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010026 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010027 int kind1, kind2, kind;
10028 void *buf1, *buf2;
10029 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010030
Jesus Ceaac451502011-04-20 17:09:23 +020010031 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10032 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010033 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010034
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010035 kind1 = PyUnicode_KIND(self);
10036 kind2 = PyUnicode_KIND(substring);
10037 kind = kind1 > kind2 ? kind1 : kind2;
10038 buf1 = PyUnicode_DATA(self);
10039 buf2 = PyUnicode_DATA(substring);
10040 if (kind1 != kind)
10041 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
10042 if (!buf1) {
10043 Py_DECREF(substring);
10044 return NULL;
10045 }
10046 if (kind2 != kind)
10047 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
10048 if (!buf2) {
10049 Py_DECREF(substring);
10050 if (kind1 != kind) PyMem_Free(buf1);
10051 return NULL;
10052 }
10053 len1 = PyUnicode_GET_LENGTH(self);
10054 len2 = PyUnicode_GET_LENGTH(substring);
10055
10056 ADJUST_INDICES(start, end, len1);
10057 switch(kind) {
10058 case PyUnicode_1BYTE_KIND:
10059 iresult = ucs1lib_count(
10060 ((Py_UCS1*)buf1) + start, end - start,
10061 buf2, len2, PY_SSIZE_T_MAX
10062 );
10063 break;
10064 case PyUnicode_2BYTE_KIND:
10065 iresult = ucs2lib_count(
10066 ((Py_UCS2*)buf1) + start, end - start,
10067 buf2, len2, PY_SSIZE_T_MAX
10068 );
10069 break;
10070 case PyUnicode_4BYTE_KIND:
10071 iresult = ucs4lib_count(
10072 ((Py_UCS4*)buf1) + start, end - start,
10073 buf2, len2, PY_SSIZE_T_MAX
10074 );
10075 break;
10076 default:
10077 assert(0); iresult = 0;
10078 }
10079
10080 result = PyLong_FromSsize_t(iresult);
10081
10082 if (kind1 != kind)
10083 PyMem_Free(buf1);
10084 if (kind2 != kind)
10085 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010086
10087 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010088
Guido van Rossumd57fd912000-03-10 22:53:23 +000010089 return result;
10090}
10091
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010092PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010093 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010094\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010095Encode S using the codec registered for encoding. Default encoding\n\
10096is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010097handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010098a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10099'xmlcharrefreplace' as well as any other name registered with\n\
10100codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010101
10102static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +000010103unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010104{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010105 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010106 char *encoding = NULL;
10107 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010108
Benjamin Peterson308d6372009-09-18 21:42:35 +000010109 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10110 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010111 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +000010112 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010113}
10114
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010115PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010116 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010117\n\
10118Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010119If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010120
10121static PyObject*
10122unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
10123{
10124 Py_UNICODE *e;
10125 Py_UNICODE *p;
10126 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010127 Py_UNICODE *qe;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010128 Py_ssize_t i, j, incr, wstr_length;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010129 PyUnicodeObject *u;
10130 int tabsize = 8;
10131
10132 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010133 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010134
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010135 if (PyUnicode_AsUnicodeAndSize((PyObject *)self, &wstr_length) == NULL)
10136 return NULL;
10137
Thomas Wouters7e474022000-07-16 12:04:32 +000010138 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010139 i = 0; /* chars up to and including most recent \n or \r */
10140 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010141 e = _PyUnicode_WSTR(self) + wstr_length; /* end of input */
10142 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010143 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010144 if (tabsize > 0) {
10145 incr = tabsize - (j % tabsize); /* cannot overflow */
10146 if (j > PY_SSIZE_T_MAX - incr)
10147 goto overflow1;
10148 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010149 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010150 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010151 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010152 if (j > PY_SSIZE_T_MAX - 1)
10153 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010154 j++;
10155 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010156 if (i > PY_SSIZE_T_MAX - j)
10157 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010158 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010159 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010160 }
10161 }
10162
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010163 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +000010164 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010165
Guido van Rossumd57fd912000-03-10 22:53:23 +000010166 /* Second pass: create output string and fill it */
10167 u = _PyUnicode_New(i + j);
10168 if (!u)
10169 return NULL;
10170
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010171 j = 0; /* same as in first pass */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010172 q = _PyUnicode_WSTR(u); /* next output char */
10173 qe = _PyUnicode_WSTR(u) + PyUnicode_GET_SIZE(u); /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +000010174
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010175 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010176 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010177 if (tabsize > 0) {
10178 i = tabsize - (j % tabsize);
10179 j += i;
10180 while (i--) {
10181 if (q >= qe)
10182 goto overflow2;
10183 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010184 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010185 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010186 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010187 else {
10188 if (q >= qe)
10189 goto overflow2;
10190 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010191 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010192 if (*p == '\n' || *p == '\r')
10193 j = 0;
10194 }
10195
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020010196 if (_PyUnicode_READY_REPLACE(&u)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010197 Py_DECREF(u);
10198 return NULL;
10199 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010200 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010201
10202 overflow2:
10203 Py_DECREF(u);
10204 overflow1:
10205 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10206 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010207}
10208
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010209PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010210 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010211\n\
10212Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010213such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010214arguments start and end are interpreted as in slice notation.\n\
10215\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010216Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010217
10218static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010219unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010220{
Jesus Ceaac451502011-04-20 17:09:23 +020010221 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010222 Py_ssize_t start;
10223 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010224 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010225
Jesus Ceaac451502011-04-20 17:09:23 +020010226 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10227 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010228 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010229
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010230 if (PyUnicode_READY(self) == -1)
10231 return NULL;
10232 if (PyUnicode_READY(substring) == -1)
10233 return NULL;
10234
10235 result = any_find_slice(
10236 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
10237 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010238 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010239
10240 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010241
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010242 if (result == -2)
10243 return NULL;
10244
Christian Heimes217cfd12007-12-02 14:31:20 +000010245 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010246}
10247
10248static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010249unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010250{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010251 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
10252 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010253 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010254 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010255}
10256
Guido van Rossumc2504932007-09-18 19:42:40 +000010257/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010010258 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000010259static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +000010260unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010261{
Guido van Rossumc2504932007-09-18 19:42:40 +000010262 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010010263 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010264
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010265 if (_PyUnicode_HASH(self) != -1)
10266 return _PyUnicode_HASH(self);
10267 if (PyUnicode_READY(self) == -1)
10268 return -1;
10269 len = PyUnicode_GET_LENGTH(self);
10270
10271 /* The hash function as a macro, gets expanded three times below. */
10272#define HASH(P) \
10273 x = (Py_uhash_t)*P << 7; \
10274 while (--len >= 0) \
10275 x = (1000003*x) ^ (Py_uhash_t)*P++;
10276
10277 switch (PyUnicode_KIND(self)) {
10278 case PyUnicode_1BYTE_KIND: {
10279 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
10280 HASH(c);
10281 break;
10282 }
10283 case PyUnicode_2BYTE_KIND: {
10284 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
10285 HASH(s);
10286 break;
10287 }
10288 default: {
10289 Py_UCS4 *l;
10290 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
10291 "Impossible switch case in unicode_hash");
10292 l = PyUnicode_4BYTE_DATA(self);
10293 HASH(l);
10294 break;
10295 }
10296 }
10297 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
10298
Guido van Rossumc2504932007-09-18 19:42:40 +000010299 if (x == -1)
10300 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010301 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010302 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010303}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010304#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000010305
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010306PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010307 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010308\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010309Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010310
10311static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010312unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010313{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010314 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +020010315 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010316 Py_ssize_t start;
10317 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010318
Jesus Ceaac451502011-04-20 17:09:23 +020010319 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
10320 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010321 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010322
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010323 if (PyUnicode_READY(self) == -1)
10324 return NULL;
10325 if (PyUnicode_READY(substring) == -1)
10326 return NULL;
10327
10328 result = any_find_slice(
10329 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
10330 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010331 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010332
10333 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010334
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010335 if (result == -2)
10336 return NULL;
10337
Guido van Rossumd57fd912000-03-10 22:53:23 +000010338 if (result < 0) {
10339 PyErr_SetString(PyExc_ValueError, "substring not found");
10340 return NULL;
10341 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010342
Christian Heimes217cfd12007-12-02 14:31:20 +000010343 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010344}
10345
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010346PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010347 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010348\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010349Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010350at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010351
10352static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010353unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010354{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010355 Py_ssize_t i, length;
10356 int kind;
10357 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010358 int cased;
10359
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010360 if (PyUnicode_READY(self) == -1)
10361 return NULL;
10362 length = PyUnicode_GET_LENGTH(self);
10363 kind = PyUnicode_KIND(self);
10364 data = PyUnicode_DATA(self);
10365
Guido van Rossumd57fd912000-03-10 22:53:23 +000010366 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010367 if (length == 1)
10368 return PyBool_FromLong(
10369 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010370
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010371 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010372 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010373 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010374
Guido van Rossumd57fd912000-03-10 22:53:23 +000010375 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010376 for (i = 0; i < length; i++) {
10377 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010378
Benjamin Peterson29060642009-01-31 22:14:21 +000010379 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
10380 return PyBool_FromLong(0);
10381 else if (!cased && Py_UNICODE_ISLOWER(ch))
10382 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010383 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010384 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010385}
10386
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010387PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010388 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010389\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010390Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010391at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010392
10393static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010394unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010395{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010396 Py_ssize_t i, length;
10397 int kind;
10398 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010399 int cased;
10400
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010401 if (PyUnicode_READY(self) == -1)
10402 return NULL;
10403 length = PyUnicode_GET_LENGTH(self);
10404 kind = PyUnicode_KIND(self);
10405 data = PyUnicode_DATA(self);
10406
Guido van Rossumd57fd912000-03-10 22:53:23 +000010407 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010408 if (length == 1)
10409 return PyBool_FromLong(
10410 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010411
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010412 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010413 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010414 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010415
Guido van Rossumd57fd912000-03-10 22:53:23 +000010416 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010417 for (i = 0; i < length; i++) {
10418 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010419
Benjamin Peterson29060642009-01-31 22:14:21 +000010420 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
10421 return PyBool_FromLong(0);
10422 else if (!cased && Py_UNICODE_ISUPPER(ch))
10423 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010424 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010425 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010426}
10427
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010428PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010429 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010430\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010431Return True if S is a titlecased string and there is at least one\n\
10432character in S, i.e. upper- and titlecase characters may only\n\
10433follow uncased characters and lowercase characters only cased ones.\n\
10434Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010435
10436static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010437unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010438{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010439 Py_ssize_t i, length;
10440 int kind;
10441 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010442 int cased, previous_is_cased;
10443
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010444 if (PyUnicode_READY(self) == -1)
10445 return NULL;
10446 length = PyUnicode_GET_LENGTH(self);
10447 kind = PyUnicode_KIND(self);
10448 data = PyUnicode_DATA(self);
10449
Guido van Rossumd57fd912000-03-10 22:53:23 +000010450 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010451 if (length == 1) {
10452 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10453 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
10454 (Py_UNICODE_ISUPPER(ch) != 0));
10455 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010456
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010457 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010458 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010459 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010460
Guido van Rossumd57fd912000-03-10 22:53:23 +000010461 cased = 0;
10462 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010463 for (i = 0; i < length; i++) {
10464 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010465
Benjamin Peterson29060642009-01-31 22:14:21 +000010466 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
10467 if (previous_is_cased)
10468 return PyBool_FromLong(0);
10469 previous_is_cased = 1;
10470 cased = 1;
10471 }
10472 else if (Py_UNICODE_ISLOWER(ch)) {
10473 if (!previous_is_cased)
10474 return PyBool_FromLong(0);
10475 previous_is_cased = 1;
10476 cased = 1;
10477 }
10478 else
10479 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010480 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010481 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010482}
10483
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010484PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010485 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010486\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010487Return True if all characters in S are whitespace\n\
10488and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010489
10490static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010491unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010492{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010493 Py_ssize_t i, length;
10494 int kind;
10495 void *data;
10496
10497 if (PyUnicode_READY(self) == -1)
10498 return NULL;
10499 length = PyUnicode_GET_LENGTH(self);
10500 kind = PyUnicode_KIND(self);
10501 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010502
Guido van Rossumd57fd912000-03-10 22:53:23 +000010503 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010504 if (length == 1)
10505 return PyBool_FromLong(
10506 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010507
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010508 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010509 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010510 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010511
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010512 for (i = 0; i < length; i++) {
10513 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010514 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010515 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010516 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010517 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010518}
10519
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010520PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010521 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010522\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010523Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010524and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010525
10526static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010527unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010528{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010529 Py_ssize_t i, length;
10530 int kind;
10531 void *data;
10532
10533 if (PyUnicode_READY(self) == -1)
10534 return NULL;
10535 length = PyUnicode_GET_LENGTH(self);
10536 kind = PyUnicode_KIND(self);
10537 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010538
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010539 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010540 if (length == 1)
10541 return PyBool_FromLong(
10542 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010543
10544 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010545 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010546 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010547
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010548 for (i = 0; i < length; i++) {
10549 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010550 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010551 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010552 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010553}
10554
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010555PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010556 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010557\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010558Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010559and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010560
10561static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010562unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010563{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010564 int kind;
10565 void *data;
10566 Py_ssize_t len, i;
10567
10568 if (PyUnicode_READY(self) == -1)
10569 return NULL;
10570
10571 kind = PyUnicode_KIND(self);
10572 data = PyUnicode_DATA(self);
10573 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010574
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010575 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010576 if (len == 1) {
10577 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10578 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10579 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010580
10581 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010582 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010583 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010584
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010585 for (i = 0; i < len; i++) {
10586 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010587 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010588 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010589 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010590 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010591}
10592
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010593PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010594 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010595\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010596Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010597False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010598
10599static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010600unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010601{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010602 Py_ssize_t i, length;
10603 int kind;
10604 void *data;
10605
10606 if (PyUnicode_READY(self) == -1)
10607 return NULL;
10608 length = PyUnicode_GET_LENGTH(self);
10609 kind = PyUnicode_KIND(self);
10610 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010611
Guido van Rossumd57fd912000-03-10 22:53:23 +000010612 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010613 if (length == 1)
10614 return PyBool_FromLong(
10615 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010616
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010617 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010618 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010619 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010620
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010621 for (i = 0; i < length; i++) {
10622 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010623 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010624 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010625 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010626}
10627
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010628PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010629 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010630\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010631Return True if all characters in S are digits\n\
10632and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010633
10634static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010635unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010636{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010637 Py_ssize_t i, length;
10638 int kind;
10639 void *data;
10640
10641 if (PyUnicode_READY(self) == -1)
10642 return NULL;
10643 length = PyUnicode_GET_LENGTH(self);
10644 kind = PyUnicode_KIND(self);
10645 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010646
Guido van Rossumd57fd912000-03-10 22:53:23 +000010647 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010648 if (length == 1) {
10649 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10650 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
10651 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010652
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010653 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010654 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010655 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010656
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010657 for (i = 0; i < length; i++) {
10658 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010659 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010660 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010661 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010662}
10663
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010664PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010665 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010666\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010667Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010668False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010669
10670static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010671unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010672{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010673 Py_ssize_t i, length;
10674 int kind;
10675 void *data;
10676
10677 if (PyUnicode_READY(self) == -1)
10678 return NULL;
10679 length = PyUnicode_GET_LENGTH(self);
10680 kind = PyUnicode_KIND(self);
10681 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010682
Guido van Rossumd57fd912000-03-10 22:53:23 +000010683 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010684 if (length == 1)
10685 return PyBool_FromLong(
10686 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010687
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010688 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010689 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010690 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010691
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010692 for (i = 0; i < length; i++) {
10693 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010694 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010695 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010696 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010697}
10698
Martin v. Löwis47383402007-08-15 07:32:56 +000010699int
10700PyUnicode_IsIdentifier(PyObject *self)
10701{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010702 int kind;
10703 void *data;
10704 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010705 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000010706
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010707 if (PyUnicode_READY(self) == -1) {
10708 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000010709 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010710 }
10711
10712 /* Special case for empty strings */
10713 if (PyUnicode_GET_LENGTH(self) == 0)
10714 return 0;
10715 kind = PyUnicode_KIND(self);
10716 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000010717
10718 /* PEP 3131 says that the first character must be in
10719 XID_Start and subsequent characters in XID_Continue,
10720 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000010721 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000010722 letters, digits, underscore). However, given the current
10723 definition of XID_Start and XID_Continue, it is sufficient
10724 to check just for these, except that _ must be allowed
10725 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010726 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050010727 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000010728 return 0;
10729
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040010730 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010731 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010732 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000010733 return 1;
10734}
10735
10736PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010737 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000010738\n\
10739Return True if S is a valid identifier according\n\
10740to the language definition.");
10741
10742static PyObject*
10743unicode_isidentifier(PyObject *self)
10744{
10745 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
10746}
10747
Georg Brandl559e5d72008-06-11 18:37:52 +000010748PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010749 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000010750\n\
10751Return True if all characters in S are considered\n\
10752printable in repr() or S is empty, False otherwise.");
10753
10754static PyObject*
10755unicode_isprintable(PyObject *self)
10756{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010757 Py_ssize_t i, length;
10758 int kind;
10759 void *data;
10760
10761 if (PyUnicode_READY(self) == -1)
10762 return NULL;
10763 length = PyUnicode_GET_LENGTH(self);
10764 kind = PyUnicode_KIND(self);
10765 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000010766
10767 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010768 if (length == 1)
10769 return PyBool_FromLong(
10770 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000010771
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010772 for (i = 0; i < length; i++) {
10773 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010774 Py_RETURN_FALSE;
10775 }
10776 }
10777 Py_RETURN_TRUE;
10778}
10779
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010780PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000010781 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010782\n\
10783Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000010784iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010785
10786static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010787unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010788{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010789 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010790}
10791
Martin v. Löwis18e16552006-02-15 17:27:45 +000010792static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000010793unicode_length(PyUnicodeObject *self)
10794{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010795 if (PyUnicode_READY(self) == -1)
10796 return -1;
10797 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010798}
10799
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010800PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010801 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010802\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010803Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010804done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010805
10806static PyObject *
10807unicode_ljust(PyUnicodeObject *self, PyObject *args)
10808{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010809 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010810 Py_UCS4 fillchar = ' ';
10811
10812 if (PyUnicode_READY(self) == -1)
10813 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010814
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010815 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010816 return NULL;
10817
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010818 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010819 Py_INCREF(self);
10820 return (PyObject*) self;
10821 }
10822
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010823 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010824}
10825
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010826PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010827 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010828\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010829Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010830
10831static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010832unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010833{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010834 return fixup(self, fixlower);
10835}
10836
/* Strip modes; each value doubles as an index into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings, one per strip mode (indexed by the values above) */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string without its 3-character
   "|O:" prefix. */
#define STRIPNAME(i) (&stripformat[i][3])
10845
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010846/* externally visible for str.strip(unicode) */
10847PyObject *
10848_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
10849{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010850 void *data;
10851 int kind;
10852 Py_ssize_t i, j, len;
10853 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010854
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010855 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
10856 return NULL;
10857
10858 kind = PyUnicode_KIND(self);
10859 data = PyUnicode_DATA(self);
10860 len = PyUnicode_GET_LENGTH(self);
10861 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
10862 PyUnicode_DATA(sepobj),
10863 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010864
Benjamin Peterson14339b62009-01-31 16:36:08 +000010865 i = 0;
10866 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010867 while (i < len &&
10868 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010869 i++;
10870 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010871 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010872
Benjamin Peterson14339b62009-01-31 16:36:08 +000010873 j = len;
10874 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010875 do {
10876 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010877 } while (j >= i &&
10878 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000010879 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010880 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010881
Victor Stinner12bab6d2011-10-01 01:53:49 +020010882 return PyUnicode_Substring((PyObject*)self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010883}
10884
10885PyObject*
10886PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
10887{
10888 unsigned char *data;
10889 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020010890 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010891
Victor Stinnerde636f32011-10-01 03:55:54 +020010892 if (PyUnicode_READY(self) == -1)
10893 return NULL;
10894
10895 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
10896
Victor Stinner12bab6d2011-10-01 01:53:49 +020010897 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010898 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020010899 if (PyUnicode_CheckExact(self)) {
10900 Py_INCREF(self);
10901 return self;
10902 }
10903 else
10904 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010905 }
10906
Victor Stinner12bab6d2011-10-01 01:53:49 +020010907 length = end - start;
10908 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010909 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010910
Victor Stinnerde636f32011-10-01 03:55:54 +020010911 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020010912 PyErr_SetString(PyExc_IndexError, "string index out of range");
10913 return NULL;
10914 }
10915
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010916 kind = PyUnicode_KIND(self);
10917 data = PyUnicode_1BYTE_DATA(self);
Victor Stinner034f6cf2011-09-30 02:26:44 +020010918 return PyUnicode_FromKindAndData(kind,
10919 data + PyUnicode_KIND_SIZE(kind, start),
Victor Stinner12bab6d2011-10-01 01:53:49 +020010920 length);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010921}
Guido van Rossumd57fd912000-03-10 22:53:23 +000010922
10923static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010924do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010925{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010926 int kind;
10927 void *data;
10928 Py_ssize_t len, i, j;
10929
10930 if (PyUnicode_READY(self) == -1)
10931 return NULL;
10932
10933 kind = PyUnicode_KIND(self);
10934 data = PyUnicode_DATA(self);
10935 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010936
Benjamin Peterson14339b62009-01-31 16:36:08 +000010937 i = 0;
10938 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010939 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010940 i++;
10941 }
10942 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010943
Benjamin Peterson14339b62009-01-31 16:36:08 +000010944 j = len;
10945 if (striptype != LEFTSTRIP) {
10946 do {
10947 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010948 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010949 j++;
10950 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010951
Victor Stinner12bab6d2011-10-01 01:53:49 +020010952 return PyUnicode_Substring((PyObject*)self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010953}
10954
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010955
10956static PyObject *
10957do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
10958{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010959 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010960
Benjamin Peterson14339b62009-01-31 16:36:08 +000010961 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
10962 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010963
Benjamin Peterson14339b62009-01-31 16:36:08 +000010964 if (sep != NULL && sep != Py_None) {
10965 if (PyUnicode_Check(sep))
10966 return _PyUnicode_XStrip(self, striptype, sep);
10967 else {
10968 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010969 "%s arg must be None or str",
10970 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010971 return NULL;
10972 }
10973 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010974
Benjamin Peterson14339b62009-01-31 16:36:08 +000010975 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010976}
10977
10978
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010979PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010980 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010981\n\
10982Return a copy of the string S with leading and trailing\n\
10983whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010984If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010985
10986static PyObject *
10987unicode_strip(PyUnicodeObject *self, PyObject *args)
10988{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010989 if (PyTuple_GET_SIZE(args) == 0)
10990 return do_strip(self, BOTHSTRIP); /* Common case */
10991 else
10992 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010993}
10994
10995
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010996PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010997 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010998\n\
10999Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011000If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011001
11002static PyObject *
11003unicode_lstrip(PyUnicodeObject *self, PyObject *args)
11004{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011005 if (PyTuple_GET_SIZE(args) == 0)
11006 return do_strip(self, LEFTSTRIP); /* Common case */
11007 else
11008 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011009}
11010
11011
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011012PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011013 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011014\n\
11015Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011016If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011017
11018static PyObject *
11019unicode_rstrip(PyUnicodeObject *self, PyObject *args)
11020{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011021 if (PyTuple_GET_SIZE(args) == 0)
11022 return do_strip(self, RIGHTSTRIP); /* Common case */
11023 else
11024 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011025}
11026
11027
Guido van Rossumd57fd912000-03-10 22:53:23 +000011028static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000011029unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011030{
11031 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011032 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011033
Georg Brandl222de0f2009-04-12 12:01:50 +000011034 if (len < 1) {
11035 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011036 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011037 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011038
Tim Peters7a29bd52001-09-12 03:03:31 +000011039 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011040 /* no repeat, return original string */
11041 Py_INCREF(str);
11042 return (PyObject*) str;
11043 }
Tim Peters8f422462000-09-09 06:13:41 +000011044
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011045 if (PyUnicode_READY(str) == -1)
11046 return NULL;
11047
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011048 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011049 PyErr_SetString(PyExc_OverflowError,
11050 "repeated string is too long");
11051 return NULL;
11052 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011053 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011054
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011055 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011056 if (!u)
11057 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011058 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011059
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011060 if (PyUnicode_GET_LENGTH(str) == 1) {
11061 const int kind = PyUnicode_KIND(str);
11062 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11063 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011064 if (kind == PyUnicode_1BYTE_KIND)
11065 memset(to, (unsigned char)fill_char, len);
11066 else {
11067 for (n = 0; n < len; ++n)
11068 PyUnicode_WRITE(kind, to, n, fill_char);
11069 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011070 }
11071 else {
11072 /* number of characters copied this far */
11073 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
11074 const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str);
11075 char *to = (char *) PyUnicode_DATA(u);
11076 Py_MEMCPY(to, PyUnicode_DATA(str),
11077 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011078 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011079 n = (done <= nchars-done) ? done : nchars-done;
11080 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011081 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011082 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011083 }
11084
11085 return (PyObject*) u;
11086}
11087
Alexander Belopolsky40018472011-02-26 01:02:56 +000011088PyObject *
11089PyUnicode_Replace(PyObject *obj,
11090 PyObject *subobj,
11091 PyObject *replobj,
11092 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011093{
11094 PyObject *self;
11095 PyObject *str1;
11096 PyObject *str2;
11097 PyObject *result;
11098
11099 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011100 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011101 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011102 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011103 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011104 Py_DECREF(self);
11105 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011106 }
11107 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011108 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011109 Py_DECREF(self);
11110 Py_DECREF(str1);
11111 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011112 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011113 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011114 Py_DECREF(self);
11115 Py_DECREF(str1);
11116 Py_DECREF(str2);
11117 return result;
11118}
11119
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011120PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011121 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011122\n\
11123Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011124old replaced by new. If the optional argument count is\n\
11125given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011126
11127static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011128unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011129{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011130 PyObject *str1;
11131 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011132 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011133 PyObject *result;
11134
Martin v. Löwis18e16552006-02-15 17:27:45 +000011135 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011136 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011137 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011138 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011139 str1 = PyUnicode_FromObject(str1);
11140 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11141 return NULL;
11142 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011143 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011144 Py_DECREF(str1);
11145 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011146 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011147
11148 result = replace(self, str1, str2, maxcount);
11149
11150 Py_DECREF(str1);
11151 Py_DECREF(str2);
11152 return result;
11153}
11154
Alexander Belopolsky40018472011-02-26 01:02:56 +000011155static PyObject *
11156unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011157{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011158 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011159 Py_ssize_t isize;
11160 Py_ssize_t osize, squote, dquote, i, o;
11161 Py_UCS4 max, quote;
11162 int ikind, okind;
11163 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011164
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011165 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011166 return NULL;
11167
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011168 isize = PyUnicode_GET_LENGTH(unicode);
11169 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011170
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011171 /* Compute length of output, quote characters, and
11172 maximum character */
11173 osize = 2; /* quotes */
11174 max = 127;
11175 squote = dquote = 0;
11176 ikind = PyUnicode_KIND(unicode);
11177 for (i = 0; i < isize; i++) {
11178 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11179 switch (ch) {
11180 case '\'': squote++; osize++; break;
11181 case '"': dquote++; osize++; break;
11182 case '\\': case '\t': case '\r': case '\n':
11183 osize += 2; break;
11184 default:
11185 /* Fast-path ASCII */
11186 if (ch < ' ' || ch == 0x7f)
11187 osize += 4; /* \xHH */
11188 else if (ch < 0x7f)
11189 osize++;
11190 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11191 osize++;
11192 max = ch > max ? ch : max;
11193 }
11194 else if (ch < 0x100)
11195 osize += 4; /* \xHH */
11196 else if (ch < 0x10000)
11197 osize += 6; /* \uHHHH */
11198 else
11199 osize += 10; /* \uHHHHHHHH */
11200 }
11201 }
11202
11203 quote = '\'';
11204 if (squote) {
11205 if (dquote)
11206 /* Both squote and dquote present. Use squote,
11207 and escape them */
11208 osize += squote;
11209 else
11210 quote = '"';
11211 }
11212
11213 repr = PyUnicode_New(osize, max);
11214 if (repr == NULL)
11215 return NULL;
11216 okind = PyUnicode_KIND(repr);
11217 odata = PyUnicode_DATA(repr);
11218
11219 PyUnicode_WRITE(okind, odata, 0, quote);
11220 PyUnicode_WRITE(okind, odata, osize-1, quote);
11221
11222 for (i = 0, o = 1; i < isize; i++) {
11223 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011224
11225 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011226 if ((ch == quote) || (ch == '\\')) {
11227 PyUnicode_WRITE(okind, odata, o++, '\\');
11228 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011229 continue;
11230 }
11231
Benjamin Peterson29060642009-01-31 22:14:21 +000011232 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011233 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011234 PyUnicode_WRITE(okind, odata, o++, '\\');
11235 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011236 }
11237 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011238 PyUnicode_WRITE(okind, odata, o++, '\\');
11239 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011240 }
11241 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011242 PyUnicode_WRITE(okind, odata, o++, '\\');
11243 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011244 }
11245
11246 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011247 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011248 PyUnicode_WRITE(okind, odata, o++, '\\');
11249 PyUnicode_WRITE(okind, odata, o++, 'x');
11250 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11251 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011252 }
11253
Georg Brandl559e5d72008-06-11 18:37:52 +000011254 /* Copy ASCII characters as-is */
11255 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011256 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011257 }
11258
Benjamin Peterson29060642009-01-31 22:14:21 +000011259 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000011260 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011261 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000011262 (categories Z* and C* except ASCII space)
11263 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011264 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011265 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011266 if (ch <= 0xff) {
11267 PyUnicode_WRITE(okind, odata, o++, '\\');
11268 PyUnicode_WRITE(okind, odata, o++, 'x');
11269 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11270 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011271 }
11272 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011273 else if (ch >= 0x10000) {
11274 PyUnicode_WRITE(okind, odata, o++, '\\');
11275 PyUnicode_WRITE(okind, odata, o++, 'U');
11276 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
11277 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
11278 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
11279 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
11280 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11281 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11282 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11283 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011284 }
11285 /* Map 16-bit characters to '\uxxxx' */
11286 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011287 PyUnicode_WRITE(okind, odata, o++, '\\');
11288 PyUnicode_WRITE(okind, odata, o++, 'u');
11289 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11290 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11291 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11292 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011293 }
11294 }
11295 /* Copy characters as-is */
11296 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011297 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011298 }
11299 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000011300 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011301 /* Closing quote already added at the beginning */
Walter Dörwald79e913e2007-05-12 11:08:06 +000011302 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011303}
11304
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011305PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011306 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011307\n\
11308Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011309such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011310arguments start and end are interpreted as in slice notation.\n\
11311\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011312Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011313
11314static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011315unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011316{
Jesus Ceaac451502011-04-20 17:09:23 +020011317 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011318 Py_ssize_t start;
11319 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011320 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011321
Jesus Ceaac451502011-04-20 17:09:23 +020011322 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
11323 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011324 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011325
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011326 if (PyUnicode_READY(self) == -1)
11327 return NULL;
11328 if (PyUnicode_READY(substring) == -1)
11329 return NULL;
11330
11331 result = any_find_slice(
11332 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11333 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011334 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011335
11336 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011337
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011338 if (result == -2)
11339 return NULL;
11340
Christian Heimes217cfd12007-12-02 14:31:20 +000011341 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011342}
11343
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011344PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011345 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011346\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011347Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011348
11349static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011350unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011351{
Jesus Ceaac451502011-04-20 17:09:23 +020011352 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011353 Py_ssize_t start;
11354 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011355 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011356
Jesus Ceaac451502011-04-20 17:09:23 +020011357 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
11358 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011359 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011360
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011361 if (PyUnicode_READY(self) == -1)
11362 return NULL;
11363 if (PyUnicode_READY(substring) == -1)
11364 return NULL;
11365
11366 result = any_find_slice(
11367 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11368 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011369 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011370
11371 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011372
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011373 if (result == -2)
11374 return NULL;
11375
Guido van Rossumd57fd912000-03-10 22:53:23 +000011376 if (result < 0) {
11377 PyErr_SetString(PyExc_ValueError, "substring not found");
11378 return NULL;
11379 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011380
Christian Heimes217cfd12007-12-02 14:31:20 +000011381 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011382}
11383
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011384PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011385 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011386\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011387Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011388done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011389
11390static PyObject *
11391unicode_rjust(PyUnicodeObject *self, PyObject *args)
11392{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011393 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011394 Py_UCS4 fillchar = ' ';
11395
Victor Stinnere9a29352011-10-01 02:14:59 +020011396 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011397 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011398
Victor Stinnere9a29352011-10-01 02:14:59 +020011399 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011400 return NULL;
11401
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011402 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011403 Py_INCREF(self);
11404 return (PyObject*) self;
11405 }
11406
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011407 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011408}
11409
Alexander Belopolsky40018472011-02-26 01:02:56 +000011410PyObject *
11411PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011412{
11413 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000011414
Guido van Rossumd57fd912000-03-10 22:53:23 +000011415 s = PyUnicode_FromObject(s);
11416 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011417 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011418 if (sep != NULL) {
11419 sep = PyUnicode_FromObject(sep);
11420 if (sep == NULL) {
11421 Py_DECREF(s);
11422 return NULL;
11423 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011424 }
11425
11426 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11427
11428 Py_DECREF(s);
11429 Py_XDECREF(sep);
11430 return result;
11431}
11432
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011433PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011434 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011435\n\
11436Return a list of the words in S, using sep as the\n\
11437delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000011438splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000011439whitespace string is a separator and empty strings are\n\
11440removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011441
11442static PyObject*
11443unicode_split(PyUnicodeObject *self, PyObject *args)
11444{
11445 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011446 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011447
Martin v. Löwis18e16552006-02-15 17:27:45 +000011448 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011449 return NULL;
11450
11451 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011452 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011453 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011454 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011455 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011456 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011457}
11458
Thomas Wouters477c8d52006-05-27 19:21:47 +000011459PyObject *
11460PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11461{
11462 PyObject* str_obj;
11463 PyObject* sep_obj;
11464 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011465 int kind1, kind2, kind;
11466 void *buf1 = NULL, *buf2 = NULL;
11467 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011468
11469 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020011470 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011471 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011472 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011473 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011474 Py_DECREF(str_obj);
11475 return NULL;
11476 }
11477
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011478 kind1 = PyUnicode_KIND(str_in);
11479 kind2 = PyUnicode_KIND(sep_obj);
11480 kind = kind1 > kind2 ? kind1 : kind2;
11481 buf1 = PyUnicode_DATA(str_in);
11482 if (kind1 != kind)
11483 buf1 = _PyUnicode_AsKind(str_in, kind);
11484 if (!buf1)
11485 goto onError;
11486 buf2 = PyUnicode_DATA(sep_obj);
11487 if (kind2 != kind)
11488 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11489 if (!buf2)
11490 goto onError;
11491 len1 = PyUnicode_GET_LENGTH(str_obj);
11492 len2 = PyUnicode_GET_LENGTH(sep_obj);
11493
11494 switch(PyUnicode_KIND(str_in)) {
11495 case PyUnicode_1BYTE_KIND:
11496 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11497 break;
11498 case PyUnicode_2BYTE_KIND:
11499 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11500 break;
11501 case PyUnicode_4BYTE_KIND:
11502 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11503 break;
11504 default:
11505 assert(0);
11506 out = 0;
11507 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011508
11509 Py_DECREF(sep_obj);
11510 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011511 if (kind1 != kind)
11512 PyMem_Free(buf1);
11513 if (kind2 != kind)
11514 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011515
11516 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011517 onError:
11518 Py_DECREF(sep_obj);
11519 Py_DECREF(str_obj);
11520 if (kind1 != kind && buf1)
11521 PyMem_Free(buf1);
11522 if (kind2 != kind && buf2)
11523 PyMem_Free(buf2);
11524 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011525}
11526
11527
11528PyObject *
11529PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11530{
11531 PyObject* str_obj;
11532 PyObject* sep_obj;
11533 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011534 int kind1, kind2, kind;
11535 void *buf1 = NULL, *buf2 = NULL;
11536 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011537
11538 str_obj = PyUnicode_FromObject(str_in);
11539 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000011540 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011541 sep_obj = PyUnicode_FromObject(sep_in);
11542 if (!sep_obj) {
11543 Py_DECREF(str_obj);
11544 return NULL;
11545 }
11546
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011547 kind1 = PyUnicode_KIND(str_in);
11548 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020011549 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011550 buf1 = PyUnicode_DATA(str_in);
11551 if (kind1 != kind)
11552 buf1 = _PyUnicode_AsKind(str_in, kind);
11553 if (!buf1)
11554 goto onError;
11555 buf2 = PyUnicode_DATA(sep_obj);
11556 if (kind2 != kind)
11557 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11558 if (!buf2)
11559 goto onError;
11560 len1 = PyUnicode_GET_LENGTH(str_obj);
11561 len2 = PyUnicode_GET_LENGTH(sep_obj);
11562
11563 switch(PyUnicode_KIND(str_in)) {
11564 case PyUnicode_1BYTE_KIND:
11565 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11566 break;
11567 case PyUnicode_2BYTE_KIND:
11568 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11569 break;
11570 case PyUnicode_4BYTE_KIND:
11571 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11572 break;
11573 default:
11574 assert(0);
11575 out = 0;
11576 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011577
11578 Py_DECREF(sep_obj);
11579 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011580 if (kind1 != kind)
11581 PyMem_Free(buf1);
11582 if (kind2 != kind)
11583 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011584
11585 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011586 onError:
11587 Py_DECREF(sep_obj);
11588 Py_DECREF(str_obj);
11589 if (kind1 != kind && buf1)
11590 PyMem_Free(buf1);
11591 if (kind2 != kind && buf2)
11592 PyMem_Free(buf2);
11593 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011594}
11595
11596PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011597 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011598\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011599Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011600the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011601found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011602
11603static PyObject*
11604unicode_partition(PyUnicodeObject *self, PyObject *separator)
11605{
11606 return PyUnicode_Partition((PyObject *)self, separator);
11607}
11608
11609PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000011610 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011611\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011612Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011613the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011614separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011615
11616static PyObject*
11617unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
11618{
11619 return PyUnicode_RPartition((PyObject *)self, separator);
11620}
11621
Alexander Belopolsky40018472011-02-26 01:02:56 +000011622PyObject *
11623PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011624{
11625 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011626
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011627 s = PyUnicode_FromObject(s);
11628 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011629 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011630 if (sep != NULL) {
11631 sep = PyUnicode_FromObject(sep);
11632 if (sep == NULL) {
11633 Py_DECREF(s);
11634 return NULL;
11635 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011636 }
11637
11638 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11639
11640 Py_DECREF(s);
11641 Py_XDECREF(sep);
11642 return result;
11643}
11644
11645PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011646 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011647\n\
11648Return a list of the words in S, using sep as the\n\
11649delimiter string, starting at the end of the string and\n\
11650working to the front. If maxsplit is given, at most maxsplit\n\
11651splits are done. If sep is not specified, any whitespace string\n\
11652is a separator.");
11653
11654static PyObject*
11655unicode_rsplit(PyUnicodeObject *self, PyObject *args)
11656{
11657 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011658 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011659
Martin v. Löwis18e16552006-02-15 17:27:45 +000011660 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011661 return NULL;
11662
11663 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011664 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011665 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011666 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011667 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011668 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011669}
11670
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011671PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011672 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011673\n\
11674Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000011675Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011676is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011677
11678static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011679unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011680{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011681 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000011682 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011683
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011684 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
11685 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011686 return NULL;
11687
Guido van Rossum86662912000-04-11 15:38:46 +000011688 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011689}
11690
11691static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000011692PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011693{
Walter Dörwald346737f2007-05-31 10:44:43 +000011694 if (PyUnicode_CheckExact(self)) {
11695 Py_INCREF(self);
11696 return self;
11697 } else
11698 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020011699 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011700}
11701
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011702PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011703 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011704\n\
11705Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011706and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011707
11708static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011709unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011710{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011711 return fixup(self, fixswapcase);
11712}
11713
Georg Brandlceee0772007-11-27 23:48:05 +000011714PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011715 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011716\n\
11717Return a translation table usable for str.translate().\n\
11718If there is only one argument, it must be a dictionary mapping Unicode\n\
11719ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011720Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011721If there are two arguments, they must be strings of equal length, and\n\
11722in the resulting dictionary, each character in x will be mapped to the\n\
11723character at the same position in y. If there is a third argument, it\n\
11724must be a string, whose characters will be mapped to None in the result.");
11725
11726static PyObject*
11727unicode_maketrans(PyUnicodeObject *null, PyObject *args)
11728{
11729 PyObject *x, *y = NULL, *z = NULL;
11730 PyObject *new = NULL, *key, *value;
11731 Py_ssize_t i = 0;
11732 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011733
Georg Brandlceee0772007-11-27 23:48:05 +000011734 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
11735 return NULL;
11736 new = PyDict_New();
11737 if (!new)
11738 return NULL;
11739 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011740 int x_kind, y_kind, z_kind;
11741 void *x_data, *y_data, *z_data;
11742
Georg Brandlceee0772007-11-27 23:48:05 +000011743 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000011744 if (!PyUnicode_Check(x)) {
11745 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
11746 "be a string if there is a second argument");
11747 goto err;
11748 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011749 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011750 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
11751 "arguments must have equal length");
11752 goto err;
11753 }
11754 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011755 x_kind = PyUnicode_KIND(x);
11756 y_kind = PyUnicode_KIND(y);
11757 x_data = PyUnicode_DATA(x);
11758 y_data = PyUnicode_DATA(y);
11759 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
11760 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
11761 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011762 if (!key || !value)
11763 goto err;
11764 res = PyDict_SetItem(new, key, value);
11765 Py_DECREF(key);
11766 Py_DECREF(value);
11767 if (res < 0)
11768 goto err;
11769 }
11770 /* create entries for deleting chars in z */
11771 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011772 z_kind = PyUnicode_KIND(z);
11773 z_data = PyUnicode_DATA(z);
Georg Brandlceee0772007-11-27 23:48:05 +000011774 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011775 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011776 if (!key)
11777 goto err;
11778 res = PyDict_SetItem(new, key, Py_None);
11779 Py_DECREF(key);
11780 if (res < 0)
11781 goto err;
11782 }
11783 }
11784 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011785 int kind;
11786 void *data;
11787
Georg Brandlceee0772007-11-27 23:48:05 +000011788 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000011789 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011790 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
11791 "to maketrans it must be a dict");
11792 goto err;
11793 }
11794 /* copy entries into the new dict, converting string keys to int keys */
11795 while (PyDict_Next(x, &i, &key, &value)) {
11796 if (PyUnicode_Check(key)) {
11797 /* convert string keys to integer keys */
11798 PyObject *newkey;
11799 if (PyUnicode_GET_SIZE(key) != 1) {
11800 PyErr_SetString(PyExc_ValueError, "string keys in translate "
11801 "table must be of length 1");
11802 goto err;
11803 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011804 kind = PyUnicode_KIND(key);
11805 data = PyUnicode_DATA(key);
11806 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000011807 if (!newkey)
11808 goto err;
11809 res = PyDict_SetItem(new, newkey, value);
11810 Py_DECREF(newkey);
11811 if (res < 0)
11812 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000011813 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011814 /* just keep integer keys */
11815 if (PyDict_SetItem(new, key, value) < 0)
11816 goto err;
11817 } else {
11818 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
11819 "be strings or integers");
11820 goto err;
11821 }
11822 }
11823 }
11824 return new;
11825 err:
11826 Py_DECREF(new);
11827 return NULL;
11828}
11829
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011830PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011831 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011832\n\
11833Return a copy of the string S, where all characters have been mapped\n\
11834through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011835Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000011836Unmapped characters are left untouched. Characters mapped to None\n\
11837are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011838
11839static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011840unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011841{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011842 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011843}
11844
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011845PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011846 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011847\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011848Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011849
11850static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011851unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011852{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011853 return fixup(self, fixupper);
11854}
11855
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011856PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011857 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011858\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000011859Pad a numeric string S with zeros on the left, to fill a field\n\
11860of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011861
11862static PyObject *
11863unicode_zfill(PyUnicodeObject *self, PyObject *args)
11864{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011865 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011866 PyUnicodeObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011867 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011868 int kind;
11869 void *data;
11870 Py_UCS4 chr;
11871
11872 if (PyUnicode_READY(self) == -1)
11873 return NULL;
11874
Martin v. Löwis18e16552006-02-15 17:27:45 +000011875 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011876 return NULL;
11877
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011878 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000011879 if (PyUnicode_CheckExact(self)) {
11880 Py_INCREF(self);
11881 return (PyObject*) self;
11882 }
11883 else
Victor Stinner2219e0a2011-10-01 01:16:59 +020011884 return PyUnicode_Copy((PyObject*)self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011885 }
11886
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011887 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011888
11889 u = pad(self, fill, 0, '0');
11890
Walter Dörwald068325e2002-04-15 13:36:47 +000011891 if (u == NULL)
11892 return NULL;
11893
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011894 kind = PyUnicode_KIND(u);
11895 data = PyUnicode_DATA(u);
11896 chr = PyUnicode_READ(kind, data, fill);
11897
11898 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011899 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011900 PyUnicode_WRITE(kind, data, 0, chr);
11901 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000011902 }
11903
11904 return (PyObject*) u;
11905}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011906
#if 0
/* Compiled out.  Thin wrapper returning the result of
   PyUnicode_TransformDecimalAndSpaceToASCII() applied to self. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
11914
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011915PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011916 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011917\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011918Return True if S starts with the specified prefix, False otherwise.\n\
11919With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011920With optional end, stop comparing S at that position.\n\
11921prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011922
11923static PyObject *
11924unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011925 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011926{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011927 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011928 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011929 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011930 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011931 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011932
Jesus Ceaac451502011-04-20 17:09:23 +020011933 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011934 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011935 if (PyTuple_Check(subobj)) {
11936 Py_ssize_t i;
11937 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11938 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011939 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011940 if (substring == NULL)
11941 return NULL;
11942 result = tailmatch(self, substring, start, end, -1);
11943 Py_DECREF(substring);
11944 if (result) {
11945 Py_RETURN_TRUE;
11946 }
11947 }
11948 /* nothing matched */
11949 Py_RETURN_FALSE;
11950 }
11951 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011952 if (substring == NULL) {
11953 if (PyErr_ExceptionMatches(PyExc_TypeError))
11954 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
11955 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011956 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011957 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011958 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011959 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011960 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011961}
11962
11963
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011964PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011965 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011966\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011967Return True if S ends with the specified suffix, False otherwise.\n\
11968With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011969With optional end, stop comparing S at that position.\n\
11970suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011971
11972static PyObject *
11973unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011974 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011975{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011976 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011977 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011978 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011979 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011980 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011981
Jesus Ceaac451502011-04-20 17:09:23 +020011982 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011983 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011984 if (PyTuple_Check(subobj)) {
11985 Py_ssize_t i;
11986 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11987 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011988 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011989 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011990 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011991 result = tailmatch(self, substring, start, end, +1);
11992 Py_DECREF(substring);
11993 if (result) {
11994 Py_RETURN_TRUE;
11995 }
11996 }
11997 Py_RETURN_FALSE;
11998 }
11999 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012000 if (substring == NULL) {
12001 if (PyErr_ExceptionMatches(PyExc_TypeError))
12002 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12003 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012004 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012005 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012006 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012007 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012008 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012009}
12010
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012011#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012012
12013PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012014 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012015\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012016Return a formatted version of S, using substitutions from args and kwargs.\n\
12017The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012018
Eric Smith27bbca62010-11-04 17:06:58 +000012019PyDoc_STRVAR(format_map__doc__,
12020 "S.format_map(mapping) -> str\n\
12021\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012022Return a formatted version of S, using substitutions from mapping.\n\
12023The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012024
Eric Smith4a7d76d2008-05-30 18:10:19 +000012025static PyObject *
12026unicode__format__(PyObject* self, PyObject* args)
12027{
12028 PyObject *format_spec;
12029
12030 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12031 return NULL;
12032
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012033 return _PyUnicode_FormatAdvanced(self, format_spec, 0,
12034 PyUnicode_GET_LENGTH(format_spec));
Eric Smith4a7d76d2008-05-30 18:10:19 +000012035}
12036
Eric Smith8c663262007-08-25 02:26:07 +000012037PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012038 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012039\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012040Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012041
12042static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012043unicode__sizeof__(PyUnicodeObject *v)
12044{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012045 Py_ssize_t size;
12046
12047 /* If it's a compact object, account for base structure +
12048 character data. */
12049 if (PyUnicode_IS_COMPACT_ASCII(v))
12050 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12051 else if (PyUnicode_IS_COMPACT(v))
12052 size = sizeof(PyCompactUnicodeObject) +
12053 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v);
12054 else {
12055 /* If it is a two-block object, account for base object, and
12056 for character block if present. */
12057 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012058 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012059 size += (PyUnicode_GET_LENGTH(v) + 1) *
12060 PyUnicode_CHARACTER_SIZE(v);
12061 }
12062 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012063 with the data pointer. Check if the data is not shared. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012064 if (_PyUnicode_WSTR(v) &&
Victor Stinnera3be6132011-10-03 02:16:37 +020012065 (PyUnicode_DATA(v) != _PyUnicode_WSTR(v)))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012066 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012067 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012068 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012069
12070 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012071}
12072
12073PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012074 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012075
12076static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012077unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012078{
Victor Stinner034f6cf2011-09-30 02:26:44 +020012079 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012080 if (!copy)
12081 return NULL;
12082 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012083}
12084
Guido van Rossumd57fd912000-03-10 22:53:23 +000012085static PyMethodDef unicode_methods[] = {
12086
12087 /* Order is according to common usage: often used methods should
12088 appear first, since lookup is done sequentially. */
12089
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012090 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012091 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12092 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012093 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012094 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12095 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12096 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12097 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12098 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12099 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12100 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012101 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012102 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12103 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12104 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012105 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012106 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12107 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12108 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012109 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012110 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012111 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012112 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012113 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12114 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12115 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12116 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12117 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12118 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12119 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12120 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12121 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12122 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12123 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12124 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12125 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12126 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012127 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012128 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012129 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012130 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012131 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012132 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012133 {"maketrans", (PyCFunction) unicode_maketrans,
12134 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012135 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012136#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012137 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012138#endif
12139
12140#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012141 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012142 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012143#endif
12144
Benjamin Peterson14339b62009-01-31 16:36:08 +000012145 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012146 {NULL, NULL}
12147};
12148
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012149static PyObject *
12150unicode_mod(PyObject *v, PyObject *w)
12151{
Brian Curtindfc80e32011-08-10 20:28:54 -050012152 if (!PyUnicode_Check(v))
12153 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012154 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012155}
12156
12157static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012158 0, /*nb_add*/
12159 0, /*nb_subtract*/
12160 0, /*nb_multiply*/
12161 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012162};
12163
Guido van Rossumd57fd912000-03-10 22:53:23 +000012164static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012165 (lenfunc) unicode_length, /* sq_length */
12166 PyUnicode_Concat, /* sq_concat */
12167 (ssizeargfunc) unicode_repeat, /* sq_repeat */
12168 (ssizeargfunc) unicode_getitem, /* sq_item */
12169 0, /* sq_slice */
12170 0, /* sq_ass_item */
12171 0, /* sq_ass_slice */
12172 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012173};
12174
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012175static PyObject*
12176unicode_subscript(PyUnicodeObject* self, PyObject* item)
12177{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012178 if (PyUnicode_READY(self) == -1)
12179 return NULL;
12180
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000012181 if (PyIndex_Check(item)) {
12182 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012183 if (i == -1 && PyErr_Occurred())
12184 return NULL;
12185 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012186 i += PyUnicode_GET_LENGTH(self);
Victor Stinner2fe5ced2011-10-02 00:25:40 +020012187 return unicode_getitem((PyObject*)self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012188 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000012189 Py_ssize_t start, stop, step, slicelength, cur, i;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012190 const Py_UNICODE* source_buf;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012191 Py_UNICODE* result_buf;
12192 PyObject* result;
12193
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012194 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000012195 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012196 return NULL;
12197 }
12198
12199 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012200 return PyUnicode_New(0, 0);
12201 } else if (start == 0 && step == 1 &&
12202 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000012203 PyUnicode_CheckExact(self)) {
12204 Py_INCREF(self);
12205 return (PyObject *)self;
12206 } else if (step == 1) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012207 return PyUnicode_Substring((PyObject*)self,
12208 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012209 } else {
12210 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +000012211 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
12212 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012213
Benjamin Peterson29060642009-01-31 22:14:21 +000012214 if (result_buf == NULL)
12215 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012216
12217 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
12218 result_buf[i] = source_buf[cur];
12219 }
Tim Petersced69f82003-09-16 20:30:58 +000012220
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012221 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +000012222 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012223 return result;
12224 }
12225 } else {
12226 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
12227 return NULL;
12228 }
12229}
12230
12231static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012232 (lenfunc)unicode_length, /* mp_length */
12233 (binaryfunc)unicode_subscript, /* mp_subscript */
12234 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012235};
12236
Guido van Rossumd57fd912000-03-10 22:53:23 +000012237
Guido van Rossumd57fd912000-03-10 22:53:23 +000012238/* Helpers for PyUnicode_Format() */
12239
12240static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000012241getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012242{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012243 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012244 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012245 (*p_argidx)++;
12246 if (arglen < 0)
12247 return args;
12248 else
12249 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012250 }
12251 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012252 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012253 return NULL;
12254}
12255
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012256/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012257
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012258static PyObject *
12259formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012260{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012261 char *p;
12262 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012263 double x;
Tim Petersced69f82003-09-16 20:30:58 +000012264
Guido van Rossumd57fd912000-03-10 22:53:23 +000012265 x = PyFloat_AsDouble(v);
12266 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012267 return NULL;
12268
Guido van Rossumd57fd912000-03-10 22:53:23 +000012269 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012270 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000012271
Eric Smith0923d1d2009-04-16 20:16:10 +000012272 p = PyOS_double_to_string(x, type, prec,
12273 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012274 if (p == NULL)
12275 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012276 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000012277 PyMem_Free(p);
12278 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012279}
12280
Tim Peters38fd5b62000-09-21 05:43:11 +000012281static PyObject*
12282formatlong(PyObject *val, int flags, int prec, int type)
12283{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012284 char *buf;
12285 int len;
12286 PyObject *str; /* temporary string object. */
12287 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012288
Benjamin Peterson14339b62009-01-31 16:36:08 +000012289 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
12290 if (!str)
12291 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012292 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012293 Py_DECREF(str);
12294 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012295}
12296
Guido van Rossumd57fd912000-03-10 22:53:23 +000012297static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012298formatchar(Py_UCS4 *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012299 size_t buflen,
12300 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012301{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012302 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012303 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012304 if (PyUnicode_GET_LENGTH(v) == 1) {
12305 buf[0] = PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000012306 buf[1] = '\0';
12307 return 1;
12308 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012309 goto onError;
12310 }
12311 else {
12312 /* Integer input truncated to a character */
12313 long x;
12314 x = PyLong_AsLong(v);
12315 if (x == -1 && PyErr_Occurred())
12316 goto onError;
12317
12318 if (x < 0 || x > 0x10ffff) {
12319 PyErr_SetString(PyExc_OverflowError,
12320 "%c arg not in range(0x110000)");
12321 return -1;
12322 }
12323
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012324 buf[0] = (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012325 buf[1] = '\0';
12326 return 1;
12327 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012328
Benjamin Peterson29060642009-01-31 22:14:21 +000012329 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012330 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012331 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012332 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012333}
12334
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012335/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012336 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012337*/
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012338#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012339
Alexander Belopolsky40018472011-02-26 01:02:56 +000012340PyObject *
12341PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012342{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012343 void *fmt;
12344 int fmtkind;
12345 PyObject *result;
12346 Py_UCS4 *res, *res0;
12347 Py_UCS4 max;
12348 int kind;
12349 Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012350 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012351 PyObject *dict = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012352 PyUnicodeObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +000012353
Guido van Rossumd57fd912000-03-10 22:53:23 +000012354 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012355 PyErr_BadInternalCall();
12356 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012357 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012358 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
12359 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012360 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012361 fmt = PyUnicode_DATA(uformat);
12362 fmtkind = PyUnicode_KIND(uformat);
12363 fmtcnt = PyUnicode_GET_LENGTH(uformat);
12364 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012365
12366 reslen = rescnt = fmtcnt + 100;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012367 res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4));
12368 if (res0 == NULL) {
12369 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012370 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012371 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012372
12373 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012374 arglen = PyTuple_Size(args);
12375 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012376 }
12377 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012378 arglen = -1;
12379 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012380 }
Christian Heimes90aa7642007-12-19 02:45:37 +000012381 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000012382 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000012383 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012384
12385 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012386 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012387 if (--rescnt < 0) {
12388 rescnt = fmtcnt + 100;
12389 reslen += rescnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012390 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12391 if (res0 == NULL){
12392 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012393 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012394 }
12395 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012396 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012397 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012398 *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012399 }
12400 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012401 /* Got a format specifier */
12402 int flags = 0;
12403 Py_ssize_t width = -1;
12404 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012405 Py_UCS4 c = '\0';
12406 Py_UCS4 fill;
Benjamin Peterson29060642009-01-31 22:14:21 +000012407 int isnumok;
12408 PyObject *v = NULL;
12409 PyObject *temp = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012410 void *pbuf;
12411 Py_ssize_t pindex;
Benjamin Peterson29060642009-01-31 22:14:21 +000012412 Py_UNICODE sign;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012413 Py_ssize_t len, len1;
12414 Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012415
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012416 fmtpos++;
12417 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
12418 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000012419 Py_ssize_t keylen;
12420 PyObject *key;
12421 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000012422
Benjamin Peterson29060642009-01-31 22:14:21 +000012423 if (dict == NULL) {
12424 PyErr_SetString(PyExc_TypeError,
12425 "format requires a mapping");
12426 goto onError;
12427 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012428 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012429 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012430 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012431 /* Skip over balanced parentheses */
12432 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012433 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000012434 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012435 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000012436 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012437 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000012438 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012439 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012440 if (fmtcnt < 0 || pcount > 0) {
12441 PyErr_SetString(PyExc_ValueError,
12442 "incomplete format key");
12443 goto onError;
12444 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020012445 key = PyUnicode_Substring((PyObject*)uformat,
12446 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000012447 if (key == NULL)
12448 goto onError;
12449 if (args_owned) {
12450 Py_DECREF(args);
12451 args_owned = 0;
12452 }
12453 args = PyObject_GetItem(dict, key);
12454 Py_DECREF(key);
12455 if (args == NULL) {
12456 goto onError;
12457 }
12458 args_owned = 1;
12459 arglen = -1;
12460 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012461 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012462 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012463 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012464 case '-': flags |= F_LJUST; continue;
12465 case '+': flags |= F_SIGN; continue;
12466 case ' ': flags |= F_BLANK; continue;
12467 case '#': flags |= F_ALT; continue;
12468 case '0': flags |= F_ZERO; continue;
12469 }
12470 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012471 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012472 if (c == '*') {
12473 v = getnextarg(args, arglen, &argidx);
12474 if (v == NULL)
12475 goto onError;
12476 if (!PyLong_Check(v)) {
12477 PyErr_SetString(PyExc_TypeError,
12478 "* wants int");
12479 goto onError;
12480 }
12481 width = PyLong_AsLong(v);
12482 if (width == -1 && PyErr_Occurred())
12483 goto onError;
12484 if (width < 0) {
12485 flags |= F_LJUST;
12486 width = -width;
12487 }
12488 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012489 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012490 }
12491 else if (c >= '0' && c <= '9') {
12492 width = c - '0';
12493 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012494 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012495 if (c < '0' || c > '9')
12496 break;
12497 if ((width*10) / 10 != width) {
12498 PyErr_SetString(PyExc_ValueError,
12499 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000012500 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000012501 }
12502 width = width*10 + (c - '0');
12503 }
12504 }
12505 if (c == '.') {
12506 prec = 0;
12507 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012508 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012509 if (c == '*') {
12510 v = getnextarg(args, arglen, &argidx);
12511 if (v == NULL)
12512 goto onError;
12513 if (!PyLong_Check(v)) {
12514 PyErr_SetString(PyExc_TypeError,
12515 "* wants int");
12516 goto onError;
12517 }
12518 prec = PyLong_AsLong(v);
12519 if (prec == -1 && PyErr_Occurred())
12520 goto onError;
12521 if (prec < 0)
12522 prec = 0;
12523 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012524 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012525 }
12526 else if (c >= '0' && c <= '9') {
12527 prec = c - '0';
12528 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012529 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012530 if (c < '0' || c > '9')
12531 break;
12532 if ((prec*10) / 10 != prec) {
12533 PyErr_SetString(PyExc_ValueError,
12534 "prec too big");
12535 goto onError;
12536 }
12537 prec = prec*10 + (c - '0');
12538 }
12539 }
12540 } /* prec */
12541 if (fmtcnt >= 0) {
12542 if (c == 'h' || c == 'l' || c == 'L') {
12543 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012544 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012545 }
12546 }
12547 if (fmtcnt < 0) {
12548 PyErr_SetString(PyExc_ValueError,
12549 "incomplete format");
12550 goto onError;
12551 }
12552 if (c != '%') {
12553 v = getnextarg(args, arglen, &argidx);
12554 if (v == NULL)
12555 goto onError;
12556 }
12557 sign = 0;
12558 fill = ' ';
12559 switch (c) {
12560
12561 case '%':
12562 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012563 kind = PyUnicode_4BYTE_KIND;
Benjamin Peterson29060642009-01-31 22:14:21 +000012564 /* presume that buffer length is at least 1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012565 PyUnicode_WRITE(kind, pbuf, 0, '%');
Benjamin Peterson29060642009-01-31 22:14:21 +000012566 len = 1;
12567 break;
12568
12569 case 's':
12570 case 'r':
12571 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000012572 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012573 temp = v;
12574 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012575 }
12576 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012577 if (c == 's')
12578 temp = PyObject_Str(v);
12579 else if (c == 'r')
12580 temp = PyObject_Repr(v);
12581 else
12582 temp = PyObject_ASCII(v);
12583 if (temp == NULL)
12584 goto onError;
12585 if (PyUnicode_Check(temp))
12586 /* nothing to do */;
12587 else {
12588 Py_DECREF(temp);
12589 PyErr_SetString(PyExc_TypeError,
12590 "%s argument has non-string str()");
12591 goto onError;
12592 }
12593 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012594 if (PyUnicode_READY(temp) == -1) {
12595 Py_CLEAR(temp);
12596 goto onError;
12597 }
12598 pbuf = PyUnicode_DATA(temp);
12599 kind = PyUnicode_KIND(temp);
12600 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012601 if (prec >= 0 && len > prec)
12602 len = prec;
12603 break;
12604
12605 case 'i':
12606 case 'd':
12607 case 'u':
12608 case 'o':
12609 case 'x':
12610 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000012611 isnumok = 0;
12612 if (PyNumber_Check(v)) {
12613 PyObject *iobj=NULL;
12614
12615 if (PyLong_Check(v)) {
12616 iobj = v;
12617 Py_INCREF(iobj);
12618 }
12619 else {
12620 iobj = PyNumber_Long(v);
12621 }
12622 if (iobj!=NULL) {
12623 if (PyLong_Check(iobj)) {
12624 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070012625 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000012626 Py_DECREF(iobj);
12627 if (!temp)
12628 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012629 if (PyUnicode_READY(temp) == -1) {
12630 Py_CLEAR(temp);
12631 goto onError;
12632 }
12633 pbuf = PyUnicode_DATA(temp);
12634 kind = PyUnicode_KIND(temp);
12635 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012636 sign = 1;
12637 }
12638 else {
12639 Py_DECREF(iobj);
12640 }
12641 }
12642 }
12643 if (!isnumok) {
12644 PyErr_Format(PyExc_TypeError,
12645 "%%%c format: a number is required, "
12646 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
12647 goto onError;
12648 }
12649 if (flags & F_ZERO)
12650 fill = '0';
12651 break;
12652
12653 case 'e':
12654 case 'E':
12655 case 'f':
12656 case 'F':
12657 case 'g':
12658 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012659 temp = formatfloat(v, flags, prec, c);
12660 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000012661 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012662 if (PyUnicode_READY(temp) == -1) {
12663 Py_CLEAR(temp);
12664 goto onError;
12665 }
12666 pbuf = PyUnicode_DATA(temp);
12667 kind = PyUnicode_KIND(temp);
12668 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012669 sign = 1;
12670 if (flags & F_ZERO)
12671 fill = '0';
12672 break;
12673
12674 case 'c':
12675 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012676 kind = PyUnicode_4BYTE_KIND;
Victor Stinnerb9dcffb2011-09-29 00:39:24 +020012677 len = formatchar(pbuf, Py_ARRAY_LENGTH(formatbuf), v);
Benjamin Peterson29060642009-01-31 22:14:21 +000012678 if (len < 0)
12679 goto onError;
12680 break;
12681
12682 default:
12683 PyErr_Format(PyExc_ValueError,
12684 "unsupported format character '%c' (0x%x) "
12685 "at index %zd",
12686 (31<=c && c<=126) ? (char)c : '?',
12687 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012688 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000012689 goto onError;
12690 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012691 /* pbuf is initialized here. */
12692 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000012693 if (sign) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012694 if (PyUnicode_READ(kind, pbuf, pindex) == '-' ||
12695 PyUnicode_READ(kind, pbuf, pindex) == '+') {
12696 sign = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012697 len--;
12698 }
12699 else if (flags & F_SIGN)
12700 sign = '+';
12701 else if (flags & F_BLANK)
12702 sign = ' ';
12703 else
12704 sign = 0;
12705 }
12706 if (width < len)
12707 width = len;
12708 if (rescnt - (sign != 0) < width) {
12709 reslen -= rescnt;
12710 rescnt = width + fmtcnt + 100;
12711 reslen += rescnt;
12712 if (reslen < 0) {
12713 Py_XDECREF(temp);
12714 PyErr_NoMemory();
12715 goto onError;
12716 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012717 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12718 if (res0 == 0) {
12719 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012720 Py_XDECREF(temp);
12721 goto onError;
12722 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012723 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012724 }
12725 if (sign) {
12726 if (fill != ' ')
12727 *res++ = sign;
12728 rescnt--;
12729 if (width > len)
12730 width--;
12731 }
12732 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012733 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12734 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000012735 if (fill != ' ') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012736 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12737 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012738 }
12739 rescnt -= 2;
12740 width -= 2;
12741 if (width < 0)
12742 width = 0;
12743 len -= 2;
12744 }
12745 if (width > len && !(flags & F_LJUST)) {
12746 do {
12747 --rescnt;
12748 *res++ = fill;
12749 } while (--width > len);
12750 }
12751 if (fill == ' ') {
12752 if (sign)
12753 *res++ = sign;
12754 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012755 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12756 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
12757 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12758 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012759 }
12760 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012761 /* Copy all characters, preserving len */
12762 len1 = len;
12763 while (len1--) {
12764 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12765 rescnt--;
12766 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012767 while (--width >= len) {
12768 --rescnt;
12769 *res++ = ' ';
12770 }
12771 if (dict && (argidx < arglen) && c != '%') {
12772 PyErr_SetString(PyExc_TypeError,
12773 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +000012774 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012775 goto onError;
12776 }
12777 Py_XDECREF(temp);
12778 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012779 } /* until end */
12780 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012781 PyErr_SetString(PyExc_TypeError,
12782 "not all arguments converted during string formatting");
12783 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012784 }
12785
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012786
12787 for (max=0, res = res0; res < res0+reslen-rescnt; res++)
12788 if (*res > max)
12789 max = *res;
12790 result = PyUnicode_New(reslen - rescnt, max);
12791 if (!result)
Benjamin Peterson29060642009-01-31 22:14:21 +000012792 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012793 kind = PyUnicode_KIND(result);
12794 for (res = res0; res < res0+reslen-rescnt; res++)
12795 PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res);
12796 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012797 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012798 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012799 }
12800 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012801 return (PyObject *)result;
12802
Benjamin Peterson29060642009-01-31 22:14:21 +000012803 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012804 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012805 Py_DECREF(uformat);
12806 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012807 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012808 }
12809 return NULL;
12810}
12811
Jeremy Hylton938ace62002-07-17 16:30:39 +000012812static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000012813unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
12814
Tim Peters6d6c1a32001-08-02 04:15:00 +000012815static PyObject *
12816unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12817{
Benjamin Peterson29060642009-01-31 22:14:21 +000012818 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012819 static char *kwlist[] = {"object", "encoding", "errors", 0};
12820 char *encoding = NULL;
12821 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000012822
Benjamin Peterson14339b62009-01-31 16:36:08 +000012823 if (type != &PyUnicode_Type)
12824 return unicode_subtype_new(type, args, kwds);
12825 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000012826 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012827 return NULL;
12828 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012829 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012830 if (encoding == NULL && errors == NULL)
12831 return PyObject_Str(x);
12832 else
Benjamin Peterson29060642009-01-31 22:14:21 +000012833 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000012834}
12835
Guido van Rossume023fe02001-08-30 03:12:59 +000012836static PyObject *
12837unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12838{
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012839 PyUnicodeObject *unicode, *self;
12840 Py_ssize_t length, char_size;
12841 int share_wstr, share_utf8;
12842 unsigned int kind;
12843 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000012844
Benjamin Peterson14339b62009-01-31 16:36:08 +000012845 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012846
12847 unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
12848 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012849 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020012850 assert(_PyUnicode_CHECK(unicode));
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020012851 if (_PyUnicode_READY_REPLACE(&unicode))
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012852 return NULL;
12853
12854 self = (PyUnicodeObject *) type->tp_alloc(type, 0);
12855 if (self == NULL) {
12856 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012857 return NULL;
12858 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012859 kind = PyUnicode_KIND(unicode);
12860 length = PyUnicode_GET_LENGTH(unicode);
12861
12862 _PyUnicode_LENGTH(self) = length;
12863 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
12864 _PyUnicode_STATE(self).interned = 0;
12865 _PyUnicode_STATE(self).kind = kind;
12866 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020012867 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012868 _PyUnicode_STATE(self).ready = 1;
12869 _PyUnicode_WSTR(self) = NULL;
12870 _PyUnicode_UTF8_LENGTH(self) = 0;
12871 _PyUnicode_UTF8(self) = NULL;
12872 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020012873 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012874
12875 share_utf8 = 0;
12876 share_wstr = 0;
12877 if (kind == PyUnicode_1BYTE_KIND) {
12878 char_size = 1;
12879 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
12880 share_utf8 = 1;
12881 }
12882 else if (kind == PyUnicode_2BYTE_KIND) {
12883 char_size = 2;
12884 if (sizeof(wchar_t) == 2)
12885 share_wstr = 1;
12886 }
12887 else {
12888 assert(kind == PyUnicode_4BYTE_KIND);
12889 char_size = 4;
12890 if (sizeof(wchar_t) == 4)
12891 share_wstr = 1;
12892 }
12893
12894 /* Ensure we won't overflow the length. */
12895 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
12896 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012897 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012898 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012899 data = PyObject_MALLOC((length + 1) * char_size);
12900 if (data == NULL) {
12901 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012902 goto onError;
12903 }
12904
Victor Stinnerc3c74152011-10-02 20:39:55 +020012905 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012906 if (share_utf8) {
12907 _PyUnicode_UTF8_LENGTH(self) = length;
12908 _PyUnicode_UTF8(self) = data;
12909 }
12910 if (share_wstr) {
12911 _PyUnicode_WSTR_LENGTH(self) = length;
12912 _PyUnicode_WSTR(self) = (wchar_t *)data;
12913 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012914
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012915 Py_MEMCPY(data, PyUnicode_DATA(unicode),
12916 PyUnicode_KIND_SIZE(kind, length + 1));
12917 Py_DECREF(unicode);
12918 return (PyObject *)self;
12919
12920onError:
12921 Py_DECREF(unicode);
12922 Py_DECREF(self);
12923 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000012924}
12925
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012926PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000012927 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000012928\n\
Collin Winterd474ce82007-08-07 19:42:11 +000012929Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000012930encoding defaults to the current default string encoding.\n\
12931errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000012932
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012933static PyObject *unicode_iter(PyObject *seq);
12934
Guido van Rossumd57fd912000-03-10 22:53:23 +000012935PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000012936 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012937 "str", /* tp_name */
12938 sizeof(PyUnicodeObject), /* tp_size */
12939 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012940 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012941 (destructor)unicode_dealloc, /* tp_dealloc */
12942 0, /* tp_print */
12943 0, /* tp_getattr */
12944 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000012945 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012946 unicode_repr, /* tp_repr */
12947 &unicode_as_number, /* tp_as_number */
12948 &unicode_as_sequence, /* tp_as_sequence */
12949 &unicode_as_mapping, /* tp_as_mapping */
12950 (hashfunc) unicode_hash, /* tp_hash*/
12951 0, /* tp_call*/
12952 (reprfunc) unicode_str, /* tp_str */
12953 PyObject_GenericGetAttr, /* tp_getattro */
12954 0, /* tp_setattro */
12955 0, /* tp_as_buffer */
12956 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000012957 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012958 unicode_doc, /* tp_doc */
12959 0, /* tp_traverse */
12960 0, /* tp_clear */
12961 PyUnicode_RichCompare, /* tp_richcompare */
12962 0, /* tp_weaklistoffset */
12963 unicode_iter, /* tp_iter */
12964 0, /* tp_iternext */
12965 unicode_methods, /* tp_methods */
12966 0, /* tp_members */
12967 0, /* tp_getset */
12968 &PyBaseObject_Type, /* tp_base */
12969 0, /* tp_dict */
12970 0, /* tp_descr_get */
12971 0, /* tp_descr_set */
12972 0, /* tp_dictoffset */
12973 0, /* tp_init */
12974 0, /* tp_alloc */
12975 unicode_new, /* tp_new */
12976 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012977};
12978
12979/* Initialize the Unicode implementation */
12980
Thomas Wouters78890102000-07-22 19:25:51 +000012981void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012982{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012983 int i;
12984
Thomas Wouters477c8d52006-05-27 19:21:47 +000012985 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012986 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012987 0x000A, /* LINE FEED */
12988 0x000D, /* CARRIAGE RETURN */
12989 0x001C, /* FILE SEPARATOR */
12990 0x001D, /* GROUP SEPARATOR */
12991 0x001E, /* RECORD SEPARATOR */
12992 0x0085, /* NEXT LINE */
12993 0x2028, /* LINE SEPARATOR */
12994 0x2029, /* PARAGRAPH SEPARATOR */
12995 };
12996
Fred Drakee4315f52000-05-09 19:53:39 +000012997 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020012998 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012999 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013000 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013001
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013002 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000013003 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000013004 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013005 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013006
13007 /* initialize the linebreak bloom filter */
13008 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013009 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020013010 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013011
13012 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013013}
13014
13015/* Finalize the Unicode implementation */
13016
/* There is nothing to release in this implementation, so this always
   reports 0.
   NOTE(review): the return value is presumably the number of entries freed;
   confirm against the callers. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
13022
Guido van Rossumd57fd912000-03-10 22:53:23 +000013023void
Thomas Wouters78890102000-07-22 19:25:51 +000013024_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013025{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013026 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013027
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000013028 Py_XDECREF(unicode_empty);
13029 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000013030
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013031 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013032 if (unicode_latin1[i]) {
13033 Py_DECREF(unicode_latin1[i]);
13034 unicode_latin1[i] = NULL;
13035 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013036 }
Christian Heimesa156e092008-02-16 07:38:31 +000013037 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000013038}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000013039
Walter Dörwald16807132007-05-25 13:52:07 +000013040void
13041PyUnicode_InternInPlace(PyObject **p)
13042{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013043 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
13044 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020013045#ifdef Py_DEBUG
13046 assert(s != NULL);
13047 assert(_PyUnicode_CHECK(s));
13048#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000013049 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020013050 return;
13051#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000013052 /* If it's a subclass, we don't really know what putting
13053 it in the interned dict might do. */
13054 if (!PyUnicode_CheckExact(s))
13055 return;
13056 if (PyUnicode_CHECK_INTERNED(s))
13057 return;
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013058 if (_PyUnicode_READY_REPLACE(p)) {
Victor Stinner4fae54c2011-10-03 02:01:52 +020013059 assert(0 && "PyUnicode_READY fail in PyUnicode_InternInPlace");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013060 return;
13061 }
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013062 s = (PyUnicodeObject *)(*p);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013063 if (interned == NULL) {
13064 interned = PyDict_New();
13065 if (interned == NULL) {
13066 PyErr_Clear(); /* Don't leave an exception */
13067 return;
13068 }
13069 }
13070 /* It might be that the GetItem call fails even
13071 though the key is present in the dictionary,
13072 namely when this happens during a stack overflow. */
13073 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000013074 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013075 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000013076
Benjamin Peterson29060642009-01-31 22:14:21 +000013077 if (t) {
13078 Py_INCREF(t);
13079 Py_DECREF(*p);
13080 *p = t;
13081 return;
13082 }
Walter Dörwald16807132007-05-25 13:52:07 +000013083
Benjamin Peterson14339b62009-01-31 16:36:08 +000013084 PyThreadState_GET()->recursion_critical = 1;
13085 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
13086 PyErr_Clear();
13087 PyThreadState_GET()->recursion_critical = 0;
13088 return;
13089 }
13090 PyThreadState_GET()->recursion_critical = 0;
13091 /* The two references in interned are not counted by refcnt.
13092 The deallocator will take care of this */
13093 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013094 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000013095}
13096
13097void
13098PyUnicode_InternImmortal(PyObject **p)
13099{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013100 PyUnicodeObject *u = (PyUnicodeObject *)*p;
13101
Benjamin Peterson14339b62009-01-31 16:36:08 +000013102 PyUnicode_InternInPlace(p);
13103 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013104 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013105 Py_INCREF(*p);
13106 }
Walter Dörwald16807132007-05-25 13:52:07 +000013107}
13108
13109PyObject *
13110PyUnicode_InternFromString(const char *cp)
13111{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013112 PyObject *s = PyUnicode_FromString(cp);
13113 if (s == NULL)
13114 return NULL;
13115 PyUnicode_InternInPlace(&s);
13116 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000013117}
13118
Alexander Belopolsky40018472011-02-26 01:02:56 +000013119void
13120_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000013121{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013122 PyObject *keys;
13123 PyUnicodeObject *s;
13124 Py_ssize_t i, n;
13125 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000013126
Benjamin Peterson14339b62009-01-31 16:36:08 +000013127 if (interned == NULL || !PyDict_Check(interned))
13128 return;
13129 keys = PyDict_Keys(interned);
13130 if (keys == NULL || !PyList_Check(keys)) {
13131 PyErr_Clear();
13132 return;
13133 }
Walter Dörwald16807132007-05-25 13:52:07 +000013134
Benjamin Peterson14339b62009-01-31 16:36:08 +000013135 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
13136 detector, interned unicode strings are not forcibly deallocated;
13137 rather, we give them their stolen references back, and then clear
13138 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000013139
Benjamin Peterson14339b62009-01-31 16:36:08 +000013140 n = PyList_GET_SIZE(keys);
13141 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000013142 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013143 for (i = 0; i < n; i++) {
13144 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013145 if (PyUnicode_READY(s) == -1)
13146 fprintf(stderr, "could not ready string\n");
13147 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013148 case SSTATE_NOT_INTERNED:
13149 /* XXX Shouldn't happen */
13150 break;
13151 case SSTATE_INTERNED_IMMORTAL:
13152 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013153 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013154 break;
13155 case SSTATE_INTERNED_MORTAL:
13156 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013157 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013158 break;
13159 default:
13160 Py_FatalError("Inconsistent interned string state.");
13161 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013162 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013163 }
13164 fprintf(stderr, "total size of all interned strings: "
13165 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
13166 "mortal/immortal\n", mortal_size, immortal_size);
13167 Py_DECREF(keys);
13168 PyDict_Clear(interned);
13169 Py_DECREF(interned);
13170 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000013171}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013172
13173
13174/********************* Unicode Iterator **************************/
13175
13176typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013177 PyObject_HEAD
13178 Py_ssize_t it_index;
13179 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013180} unicodeiterobject;
13181
13182static void
13183unicodeiter_dealloc(unicodeiterobject *it)
13184{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013185 _PyObject_GC_UNTRACK(it);
13186 Py_XDECREF(it->it_seq);
13187 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013188}
13189
13190static int
13191unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
13192{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013193 Py_VISIT(it->it_seq);
13194 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013195}
13196
13197static PyObject *
13198unicodeiter_next(unicodeiterobject *it)
13199{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013200 PyUnicodeObject *seq;
13201 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013202
Benjamin Peterson14339b62009-01-31 16:36:08 +000013203 assert(it != NULL);
13204 seq = it->it_seq;
13205 if (seq == NULL)
13206 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013207 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013208
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013209 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
13210 int kind = PyUnicode_KIND(seq);
13211 void *data = PyUnicode_DATA(seq);
13212 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
13213 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013214 if (item != NULL)
13215 ++it->it_index;
13216 return item;
13217 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013218
Benjamin Peterson14339b62009-01-31 16:36:08 +000013219 Py_DECREF(seq);
13220 it->it_seq = NULL;
13221 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013222}
13223
13224static PyObject *
13225unicodeiter_len(unicodeiterobject *it)
13226{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013227 Py_ssize_t len = 0;
13228 if (it->it_seq)
13229 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
13230 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013231}
13232
13233PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
13234
13235static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013236 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000013237 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000013238 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013239};
13240
13241PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013242 PyVarObject_HEAD_INIT(&PyType_Type, 0)
13243 "str_iterator", /* tp_name */
13244 sizeof(unicodeiterobject), /* tp_basicsize */
13245 0, /* tp_itemsize */
13246 /* methods */
13247 (destructor)unicodeiter_dealloc, /* tp_dealloc */
13248 0, /* tp_print */
13249 0, /* tp_getattr */
13250 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013251 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013252 0, /* tp_repr */
13253 0, /* tp_as_number */
13254 0, /* tp_as_sequence */
13255 0, /* tp_as_mapping */
13256 0, /* tp_hash */
13257 0, /* tp_call */
13258 0, /* tp_str */
13259 PyObject_GenericGetAttr, /* tp_getattro */
13260 0, /* tp_setattro */
13261 0, /* tp_as_buffer */
13262 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
13263 0, /* tp_doc */
13264 (traverseproc)unicodeiter_traverse, /* tp_traverse */
13265 0, /* tp_clear */
13266 0, /* tp_richcompare */
13267 0, /* tp_weaklistoffset */
13268 PyObject_SelfIter, /* tp_iter */
13269 (iternextfunc)unicodeiter_next, /* tp_iternext */
13270 unicodeiter_methods, /* tp_methods */
13271 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013272};
13273
13274static PyObject *
13275unicode_iter(PyObject *seq)
13276{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013277 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013278
Benjamin Peterson14339b62009-01-31 16:36:08 +000013279 if (!PyUnicode_Check(seq)) {
13280 PyErr_BadInternalCall();
13281 return NULL;
13282 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013283 if (PyUnicode_READY(seq) == -1)
13284 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013285 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
13286 if (it == NULL)
13287 return NULL;
13288 it->it_index = 0;
13289 Py_INCREF(seq);
13290 it->it_seq = (PyUnicodeObject *)seq;
13291 _PyObject_GC_TRACK(it);
13292 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013293}
13294
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013295#define UNIOP(x) Py_UNICODE_##x
13296#define UNIOP_t Py_UNICODE
13297#include "uniops.h"
13298#undef UNIOP
13299#undef UNIOP_t
13300#define UNIOP(x) Py_UCS4_##x
13301#define UNIOP_t Py_UCS4
13302#include "uniops.h"
13303#undef UNIOP
13304#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000013305
Victor Stinner71133ff2010-09-01 23:43:53 +000013306Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000013307PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000013308{
13309 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
13310 Py_UNICODE *copy;
13311 Py_ssize_t size;
13312
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013313 if (!PyUnicode_Check(unicode)) {
13314 PyErr_BadArgument();
13315 return NULL;
13316 }
Victor Stinner71133ff2010-09-01 23:43:53 +000013317 /* Ensure we won't overflow the size. */
13318 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
13319 PyErr_NoMemory();
13320 return NULL;
13321 }
13322 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
13323 size *= sizeof(Py_UNICODE);
13324 copy = PyMem_Malloc(size);
13325 if (copy == NULL) {
13326 PyErr_NoMemory();
13327 return NULL;
13328 }
13329 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
13330 return copy;
13331}
Martin v. Löwis5b222132007-06-10 09:51:05 +000013332
Georg Brandl66c221e2010-10-14 07:04:07 +000013333/* A _string module, to export formatter_parser and formatter_field_name_split
13334 to the string.Formatter class implemented in Python. */
13335
13336static PyMethodDef _string_methods[] = {
13337 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
13338 METH_O, PyDoc_STR("split the argument as a field name")},
13339 {"formatter_parser", (PyCFunction) formatter_parser,
13340 METH_O, PyDoc_STR("parse the argument as a format string")},
13341 {NULL, NULL}
13342};
13343
13344static struct PyModuleDef _string_module = {
13345 PyModuleDef_HEAD_INIT,
13346 "_string",
13347 PyDoc_STR("string helper module"),
13348 0,
13349 _string_methods,
13350 NULL,
13351 NULL,
13352 NULL,
13353 NULL
13354};
13355
13356PyMODINIT_FUNC
13357PyInit__string(void)
13358{
13359 return PyModule_Create(&_string_module);
13360}
13361
13362
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013363#ifdef __cplusplus
13364}
13365#endif