blob: a0d3056b7f7a9e04ac7311a1388b88ad6d5a2dc8 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
Victor Stinnerce5faf62011-10-05 00:42:43 +020049#ifdef Py_DEBUG
50# define DONT_MAKE_RESULT_READY
51#endif
52
/* Upper bound on the number of entries kept on the Unicode object
   free list. */

#define PyUnicode_MAXFREELIST 1024

/* Size limit for the free list "stay alive" optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a size less than this
   limit. This reduces malloc() overhead for small Unicode objects.

   At worst this will result in PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Endianness switches: exactly one of the two BYTEORDER_IS_* macros is
   defined; little endian is assumed unless WORDS_BIGENDIAN is set. */

#if defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
/* Sanity check used by the accessor macros below: the full consistency
   check in debug builds, a plain type check otherwise. */
#if defined(Py_DEBUG)
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif

/* Unchecked access to the utf8 pointer of the compact structure. */
#define _PyUnicode_UTF8(op) \
    (((PyCompactUnicodeObject*)(op))->utf8)

/* Checked UTF-8 pointer. Compact ASCII strings use the memory that directly
   follows the PyASCIIObject header; every other layout uses the utf8
   field. */
#define PyUnicode_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) ? \
         ((char*)((PyASCIIObject*)(op) + 1)) : \
         _PyUnicode_UTF8(op))

/* Unchecked access to the utf8_length field of the compact structure. */
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* Checked UTF-8 length. Compact ASCII strings report the length field of
   the PyASCIIObject header; every other layout uses utf8_length. */
#define PyUnicode_UTF8_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) ? \
         ((PyASCIIObject*)(op))->length : \
         _PyUnicode_UTF8_LENGTH(op))

/* Unchecked field accessors; all of them expand to lvalues. */
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op) \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) \
    (((PyASCIIObject *)(op))->hash)

/* Checked readers for the kind and the length stored in the header. */
#define _PyUnicode_KIND(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->length)

/* Unchecked access to the data pointer of the non-compact structure. */
#define _PyUnicode_DATA_ANY(op) \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200136
/* File-local replacement for the public PyUnicode_READY(): evaluates to 0
   when the string is already ready, otherwise to the result of
   _PyUnicode_Ready(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (PyUnicode_IS_READY(op) ? \
      0 : \
      _PyUnicode_Ready((PyObject *)(op))))

/* Same as above for a pointer to an object pointer: evaluates to 0 when
   *p_obj is already ready, otherwise to the result of
   _PyUnicode_ReadyReplace(). The argument is parenthesized before being
   dereferenced so that expressions such as `p + 1` expand correctly. */
#define _PyUnicode_READY_REPLACE(p_obj) \
    (assert(_PyUnicode_CHECK(*(p_obj))), \
     (PyUnicode_IS_READY(*(p_obj)) ? \
      0 : _PyUnicode_ReadyReplace((PyObject **)(p_obj))))
148
/* true if the utf8 pointer of the object aliases its character data.
   Must not be used on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)), \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* true if the wstr pointer of the object aliases its character data.
   The macro only refers to its own argument; it previously expanded to the
   identifier `unicode` and therefore depended on the caller's variable
   name. */
#define _PyUnicode_SHARE_WSTR(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
156
/* true if the Unicode object owns a UTF-8 memory block of its own: the
   string is not compact ASCII, the utf8 pointer is set, and it does not
   alias the character data */
#define _PyUnicode_HAS_UTF8_MEMORY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (!PyUnicode_IS_COMPACT_ASCII(op) \
      && _PyUnicode_UTF8(op) \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* true if the Unicode object owns a wstr memory block of its own: the wstr
   pointer is set and either the string is not ready or the pointer does not
   alias the character data */
#define _PyUnicode_HAS_WSTR_MEMORY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (_PyUnicode_WSTR(op) && \
      (!PyUnicode_IS_READY(op) || \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
172
/* Generic helper macro that copies a run of characters while converting
   each one from `from_type` to `to_type` with a plain cast.

   from_type, to_type: valid type names
   begin, end:         pointers of type "from_type *" delimiting the source
                       characters (end is exclusive and is evaluated once
                       per iteration)
   to:                 destination buffer, treated as "to_type *"; one
                       element is written per source character */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do { \
        const from_type *iter_ = (begin); \
        to_type *to_ = (to_type *)(to); \
        while (iter_ < (end)) { \
            *to_ = (to_type)*iter_; \
            ++iter_; \
            ++to_; \
        } \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200187
/* The Unicode string has been modified: drop the cached hash by storing -1
   in the hash field */
#define _PyUnicode_DIRTY(op) \
    do { \
        _PyUnicode_HASH(op) = -1; \
    } while (0)
190
Walter Dörwald16807132007-05-25 13:52:07 +0000191/* This dictionary holds all interned unicode strings. Note that references
192 to strings in this dictionary are *not* counted in the string's ob_refcnt.
193 When the interned string reaches a refcnt of 0 the string deallocation
194 function will delete the reference from this dictionary.
195
196 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000197 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000198*/
199static PyObject *interned;
200
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000201/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200202static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000203
204/* Single character Unicode strings in the Latin-1 range are being
205 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200206static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000207
/* Fast detection of the most frequent whitespace characters: a 128-entry
   table indexed by ASCII code, 1 for whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
                  0x0B LINE TABULATION, 0x0C FORM FEED,
                  0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
                  0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
238
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200239/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200240static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200241static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200242
Alexander Belopolsky40018472011-02-26 01:02:56 +0000243static PyObject *
244unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000245 PyObject **errorHandler,const char *encoding, const char *reason,
246 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
247 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
248
Alexander Belopolsky40018472011-02-26 01:02:56 +0000249static void
250raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300251 const char *encoding,
252 const Py_UNICODE *unicode, Py_ssize_t size,
253 Py_ssize_t startpos, Py_ssize_t endpos,
254 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000255
/* Same for linebreaks: a 128-entry table indexed by ASCII code, 1 for line
   break characters and 0 otherwise. */
static unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION, 0x0C FORM FEED,
                  0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
                  0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x2F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
283
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300284/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
285 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000286Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000287PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000288{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000289#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000290 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000291#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000292 /* This is actually an illegal character, so it should
293 not be passed to unichr. */
294 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000295#endif
296}
297
#ifdef Py_DEBUG
/* Check, with assertions only, that the state flags, lengths and buffer
   pointers of the Unicode object `op` agree with each other for its memory
   layout (compact ASCII, compact, not-ready wstr, or ready non-compact).

   If check_content is non-zero and the string is not of wchar kind, the
   characters are also scanned to verify that the narrowest possible kind
   (and the ascii flag) is in use.

   Always returns 1, so the call can itself be wrapped in assert().

   FIXME: use PyObject* type for op */
static int
_PyUnicode_CheckConsistency(void *op, int check_content)
{
    PyASCIIObject *ascii = (PyASCIIObject *)op;
    unsigned int kind;
    /* kind whose character width equals sizeof(wchar_t): only for that
       kind is wstr expected to alias the character data */
#if SIZEOF_WCHAR_T == 2
    const unsigned int wchar_kind = PyUnicode_2BYTE_KIND;
#else
    const unsigned int wchar_kind = PyUnicode_4BYTE_KIND;
#endif

    assert(PyUnicode_Check(op));

    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: characters follow the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            /* non-compact: characters live behind the data pointer */
            data = ((PyUnicodeObject *)op)->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready yet: only the wstr buffer exists */
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
            }
            else {
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else {
                    assert (compact->utf8 != data);
                }
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            if (kind == wchar_kind) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        /* an absent cache must have a zero length */
        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_UCS4 maxchar = 0;
        void *data = PyUnicode_DATA(ascii);
        Py_ssize_t pos;

        for (pos = 0; pos < ascii->length; pos++) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
            if (ch > maxchar)
                maxchar = ch;
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
            }
            else {
                assert(maxchar < 128);
            }
            break;
        case PyUnicode_2BYTE_KIND:
            assert(maxchar >= 0x100);
            break;
        default:
            assert(maxchar >= 0x10000);
            break;
        }
    }
    return 1;
}
#endif
401
Thomas Wouters477c8d52006-05-27 19:21:47 +0000402/* --- Bloom Filters ----------------------------------------------------- */
403
404/* stuff to implement simple "bloom filters" for Unicode characters.
405 to keep things simple, we use a single bitmask, using the least 5
406 bits from each unicode characters as the bit index. */
407
408/* the linebreak mask is set up by Unicode_Init below */
409
Antoine Pitrouf068f942010-01-13 14:19:12 +0000410#if LONG_BIT >= 128
411#define BLOOM_WIDTH 128
412#elif LONG_BIT >= 64
413#define BLOOM_WIDTH 64
414#elif LONG_BIT >= 32
415#define BLOOM_WIDTH 32
416#else
417#error "LONG_BIT is smaller than 32"
418#endif
419
Thomas Wouters477c8d52006-05-27 19:21:47 +0000420#define BLOOM_MASK unsigned long
421
422static BLOOM_MASK bloom_linebreak;
423
Antoine Pitrouf068f942010-01-13 14:19:12 +0000424#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
425#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000426
Benjamin Peterson29060642009-01-31 22:14:21 +0000427#define BLOOM_LINEBREAK(ch) \
428 ((ch) < 128U ? ascii_linebreak[(ch)] : \
429 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000430
Alexander Belopolsky40018472011-02-26 01:02:56 +0000431Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200432make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000433{
434 /* calculate simple bloom-style bitmask for a given unicode string */
435
Antoine Pitrouf068f942010-01-13 14:19:12 +0000436 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000437 Py_ssize_t i;
438
439 mask = 0;
440 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200441 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000442
443 return mask;
444}
445
/* Membership test for chr in the string object str: the bloom mask is
   consulted first and PyUnicode_FindChar() is only called over the whole
   string when the mask bit is set.  chr and str are evaluated more than
   once. */
#define BLOOM_MEMBER(mask, chr, str) \
    (BLOOM(mask, chr) \
     && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000449
Guido van Rossumd57fd912000-03-10 22:53:23 +0000450/* --- Unicode Object ----------------------------------------------------- */
451
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200452static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200453fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200454
455Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
456 Py_ssize_t size, Py_UCS4 ch,
457 int direction)
458{
459 /* like wcschr, but doesn't stop at NULL characters */
460 Py_ssize_t i;
461 if (direction == 1) {
462 for(i = 0; i < size; i++)
463 if (PyUnicode_READ(kind, s, i) == ch)
464 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
465 }
466 else {
467 for(i = size-1; i >= 0; i--)
468 if (PyUnicode_READ(kind, s, i) == ch)
469 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
470 }
471 return NULL;
472}
473
Victor Stinnerfe226c02011-10-03 03:52:20 +0200474static PyObject*
475resize_compact(PyObject *unicode, Py_ssize_t length)
476{
477 Py_ssize_t char_size;
478 Py_ssize_t struct_size;
479 Py_ssize_t new_size;
480 int share_wstr;
481
482 assert(PyUnicode_IS_READY(unicode));
483 char_size = PyUnicode_CHARACTER_SIZE(unicode);
484 if (PyUnicode_IS_COMPACT_ASCII(unicode))
485 struct_size = sizeof(PyASCIIObject);
486 else
487 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200488 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200489
490 _Py_DEC_REFTOTAL;
491 _Py_ForgetReference(unicode);
492
493 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
494 PyErr_NoMemory();
495 return NULL;
496 }
497 new_size = (struct_size + (length + 1) * char_size);
498
499 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
500 if (unicode == NULL) {
501 PyObject_Del(unicode);
502 PyErr_NoMemory();
503 return NULL;
504 }
505 _Py_NewReference(unicode);
506 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200507 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200508 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200509 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
510 _PyUnicode_WSTR_LENGTH(unicode) = length;
511 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200512 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
513 length, 0);
514 return unicode;
515}
516
Alexander Belopolsky40018472011-02-26 01:02:56 +0000517static int
Victor Stinner95663112011-10-04 01:03:50 +0200518resize_inplace(PyUnicodeObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000519{
Victor Stinner95663112011-10-04 01:03:50 +0200520 wchar_t *wstr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200521 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200522 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000523
Victor Stinner95663112011-10-04 01:03:50 +0200524 _PyUnicode_DIRTY(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200525
526 if (PyUnicode_IS_READY(unicode)) {
527 Py_ssize_t char_size;
528 Py_ssize_t new_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200529 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200530 void *data;
531
532 data = _PyUnicode_DATA_ANY(unicode);
533 assert(data != NULL);
534 char_size = PyUnicode_CHARACTER_SIZE(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200535 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
536 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinner95663112011-10-04 01:03:50 +0200537 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
538 {
539 PyObject_DEL(_PyUnicode_UTF8(unicode));
540 _PyUnicode_UTF8(unicode) = NULL;
541 _PyUnicode_UTF8_LENGTH(unicode) = 0;
542 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200543
544 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
545 PyErr_NoMemory();
546 return -1;
547 }
548 new_size = (length + 1) * char_size;
549
550 data = (PyObject *)PyObject_REALLOC(data, new_size);
551 if (data == NULL) {
552 PyErr_NoMemory();
553 return -1;
554 }
555 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200556 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200557 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200558 _PyUnicode_WSTR_LENGTH(unicode) = length;
559 }
560 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200561 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200562 _PyUnicode_UTF8_LENGTH(unicode) = length;
563 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200564 _PyUnicode_LENGTH(unicode) = length;
565 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200566 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200567 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200568 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200569 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200570 }
Victor Stinner95663112011-10-04 01:03:50 +0200571 assert(_PyUnicode_WSTR(unicode) != NULL);
572
573 /* check for integer overflow */
574 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
575 PyErr_NoMemory();
576 return -1;
577 }
578 wstr = _PyUnicode_WSTR(unicode);
579 wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1));
580 if (!wstr) {
581 PyErr_NoMemory();
582 return -1;
583 }
584 _PyUnicode_WSTR(unicode) = wstr;
585 _PyUnicode_WSTR(unicode)[length] = 0;
586 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200587 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000588 return 0;
589}
590
Victor Stinnerfe226c02011-10-03 03:52:20 +0200591static PyObject*
592resize_copy(PyObject *unicode, Py_ssize_t length)
593{
594 Py_ssize_t copy_length;
595 if (PyUnicode_IS_COMPACT(unicode)) {
596 PyObject *copy;
597 assert(PyUnicode_IS_READY(unicode));
598
599 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
600 if (copy == NULL)
601 return NULL;
602
603 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
604 if (PyUnicode_CopyCharacters(copy, 0,
605 unicode, 0,
606 copy_length) < 0)
607 {
608 Py_DECREF(copy);
609 return NULL;
610 }
611 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200612 }
613 else {
Victor Stinner2fd82272011-10-03 04:06:05 +0200614 PyUnicodeObject *w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200615 assert(_PyUnicode_WSTR(unicode) != NULL);
616 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinner2fd82272011-10-03 04:06:05 +0200617 w = _PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200618 if (w == NULL)
619 return NULL;
620 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
621 copy_length = Py_MIN(copy_length, length);
622 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
623 copy_length);
624 return (PyObject*)w;
625 }
626}
627
Guido van Rossumd57fd912000-03-10 22:53:23 +0000628/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000629 Ux0000 terminated; some code (e.g. new_identifier)
630 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000631
632 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000633 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000634
635*/
636
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200637#ifdef Py_DEBUG
638int unicode_old_new_calls = 0;
639#endif
640
Alexander Belopolsky40018472011-02-26 01:02:56 +0000641static PyUnicodeObject *
642_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000643{
644 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200645 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000646
Thomas Wouters477c8d52006-05-27 19:21:47 +0000647 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000648 if (length == 0 && unicode_empty != NULL) {
649 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200650 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000651 }
652
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000653 /* Ensure we won't overflow the size. */
654 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
655 return (PyUnicodeObject *)PyErr_NoMemory();
656 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200657 if (length < 0) {
658 PyErr_SetString(PyExc_SystemError,
659 "Negative size passed to _PyUnicode_New");
660 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000661 }
662
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200663#ifdef Py_DEBUG
664 ++unicode_old_new_calls;
665#endif
666
667 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
668 if (unicode == NULL)
669 return NULL;
670 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
671 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
672 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000673 PyErr_NoMemory();
674 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000675 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200676
Jeremy Hyltond8082792003-09-16 19:41:39 +0000677 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000678 * the caller fails before initializing str -- unicode_resize()
679 * reads str[0], and the Keep-Alive optimization can keep memory
680 * allocated for str alive across a call to unicode_dealloc(unicode).
681 * We don't want unicode_resize to read uninitialized memory in
682 * that case.
683 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200684 _PyUnicode_WSTR(unicode)[0] = 0;
685 _PyUnicode_WSTR(unicode)[length] = 0;
686 _PyUnicode_WSTR_LENGTH(unicode) = length;
687 _PyUnicode_HASH(unicode) = -1;
688 _PyUnicode_STATE(unicode).interned = 0;
689 _PyUnicode_STATE(unicode).kind = 0;
690 _PyUnicode_STATE(unicode).compact = 0;
691 _PyUnicode_STATE(unicode).ready = 0;
692 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200693 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200694 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200695 _PyUnicode_UTF8(unicode) = NULL;
696 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000697 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000698
Benjamin Peterson29060642009-01-31 22:14:21 +0000699 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000700 /* XXX UNREF/NEWREF interface should be more symmetrical */
701 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000702 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000703 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000704 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000705}
706
Victor Stinnerf42dc442011-10-02 23:33:16 +0200707static const char*
708unicode_kind_name(PyObject *unicode)
709{
Victor Stinner42dfd712011-10-03 14:41:45 +0200710 /* don't check consistency: unicode_kind_name() is called from
711 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200712 if (!PyUnicode_IS_COMPACT(unicode))
713 {
714 if (!PyUnicode_IS_READY(unicode))
715 return "wstr";
716 switch(PyUnicode_KIND(unicode))
717 {
718 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200719 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200720 return "legacy ascii";
721 else
722 return "legacy latin1";
723 case PyUnicode_2BYTE_KIND:
724 return "legacy UCS2";
725 case PyUnicode_4BYTE_KIND:
726 return "legacy UCS4";
727 default:
728 return "<legacy invalid kind>";
729 }
730 }
731 assert(PyUnicode_IS_READY(unicode));
732 switch(PyUnicode_KIND(unicode))
733 {
734 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200735 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200736 return "ascii";
737 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200738 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200739 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200740 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200741 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200742 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200743 default:
744 return "<invalid compact kind>";
745 }
746}
747
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200748#ifdef Py_DEBUG
/* Debug-build counter: incremented by PyUnicode_New() each time a call gets
   past the empty-string shortcut. */
int unicode_new_new_calls = 0;
750
751/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
755
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
759void *_PyUnicode_data(void *unicode){
760 printf("obj %p\n", unicode);
761 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
762 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
763 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
764 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
765 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
766 return PyUnicode_DATA(unicode);
767}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200768
769void
770_PyUnicode_Dump(PyObject *op)
771{
772 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200773 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
774 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
775 void *data;
776 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
777 if (ascii->state.compact)
778 data = (compact + 1);
779 else
780 data = unicode->data.any;
781 if (ascii->wstr == data)
782 printf("shared ");
783 printf("wstr=%p", ascii->wstr);
Victor Stinnera3b334d2011-10-03 13:53:37 +0200784 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200785 printf(" (%zu), ", compact->wstr_length);
786 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
787 printf("shared ");
788 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200789 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200790 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200791}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200792#endif
793
794PyObject *
795PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
796{
797 PyObject *obj;
798 PyCompactUnicodeObject *unicode;
799 void *data;
800 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200801 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200802 Py_ssize_t char_size;
803 Py_ssize_t struct_size;
804
805 /* Optimization for empty strings */
806 if (size == 0 && unicode_empty != NULL) {
807 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200808 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200809 }
810
811#ifdef Py_DEBUG
812 ++unicode_new_new_calls;
813#endif
814
Victor Stinner9e9d6892011-10-04 01:02:02 +0200815 is_ascii = 0;
816 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200817 struct_size = sizeof(PyCompactUnicodeObject);
818 if (maxchar < 128) {
819 kind_state = PyUnicode_1BYTE_KIND;
820 char_size = 1;
821 is_ascii = 1;
822 struct_size = sizeof(PyASCIIObject);
823 }
824 else if (maxchar < 256) {
825 kind_state = PyUnicode_1BYTE_KIND;
826 char_size = 1;
827 }
828 else if (maxchar < 65536) {
829 kind_state = PyUnicode_2BYTE_KIND;
830 char_size = 2;
831 if (sizeof(wchar_t) == 2)
832 is_sharing = 1;
833 }
834 else {
835 kind_state = PyUnicode_4BYTE_KIND;
836 char_size = 4;
837 if (sizeof(wchar_t) == 4)
838 is_sharing = 1;
839 }
840
841 /* Ensure we won't overflow the size. */
842 if (size < 0) {
843 PyErr_SetString(PyExc_SystemError,
844 "Negative size passed to PyUnicode_New");
845 return NULL;
846 }
847 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
848 return PyErr_NoMemory();
849
850 /* Duplicated allocation code from _PyObject_New() instead of a call to
851 * PyObject_New() so we are able to allocate space for the object and
852 * it's data buffer.
853 */
854 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
855 if (obj == NULL)
856 return PyErr_NoMemory();
857 obj = PyObject_INIT(obj, &PyUnicode_Type);
858 if (obj == NULL)
859 return NULL;
860
861 unicode = (PyCompactUnicodeObject *)obj;
862 if (is_ascii)
863 data = ((PyASCIIObject*)obj) + 1;
864 else
865 data = unicode + 1;
866 _PyUnicode_LENGTH(unicode) = size;
867 _PyUnicode_HASH(unicode) = -1;
868 _PyUnicode_STATE(unicode).interned = 0;
869 _PyUnicode_STATE(unicode).kind = kind_state;
870 _PyUnicode_STATE(unicode).compact = 1;
871 _PyUnicode_STATE(unicode).ready = 1;
872 _PyUnicode_STATE(unicode).ascii = is_ascii;
873 if (is_ascii) {
874 ((char*)data)[size] = 0;
875 _PyUnicode_WSTR(unicode) = NULL;
876 }
877 else if (kind_state == PyUnicode_1BYTE_KIND) {
878 ((char*)data)[size] = 0;
879 _PyUnicode_WSTR(unicode) = NULL;
880 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200881 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200882 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200883 }
884 else {
885 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200886 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200887 if (kind_state == PyUnicode_2BYTE_KIND)
888 ((Py_UCS2*)data)[size] = 0;
889 else /* kind_state == PyUnicode_4BYTE_KIND */
890 ((Py_UCS4*)data)[size] = 0;
891 if (is_sharing) {
892 _PyUnicode_WSTR_LENGTH(unicode) = size;
893 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
894 }
895 else {
896 _PyUnicode_WSTR_LENGTH(unicode) = 0;
897 _PyUnicode_WSTR(unicode) = NULL;
898 }
899 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200900 assert(_PyUnicode_CheckConsistency(unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200901 return obj;
902}
903
#if SIZEOF_WCHAR_T == 2
/* Convert the 16-bit wchar_t range [begin, end) to UCS4 and store the result
   in the 4-byte data buffer of `unicode`.  A high surrogate immediately
   followed by a low surrogate is combined into one code point; any other
   unit (including a lone surrogate) is copied unchanged.  The other
   conversions are implemented as macros for efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyUnicodeObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (src[0] >= 0xD800 && src[0] <= 0xDBFF
            && (src + 1) < end
            && src[1] >= 0xDC00 && src[1] <= 0xDFFF)
        {
            /* Surrogate pair: consume two units, emit one code point. */
            *dst++ = (((src[0] & 0x3FF) << 10) | (src[1] & 0x3FF)) + 0x10000;
            src += 2;
        }
        else {
            *dst++ = *src++;
        }
    }
    /* The output must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
942
Victor Stinnercd9950f2011-10-02 00:34:53 +0200943static int
944_PyUnicode_Dirty(PyObject *unicode)
945{
Victor Stinner910337b2011-10-03 03:20:16 +0200946 assert(_PyUnicode_CHECK(unicode));
Victor Stinnercd9950f2011-10-02 00:34:53 +0200947 if (Py_REFCNT(unicode) != 1) {
Victor Stinner01698042011-10-04 00:04:26 +0200948 PyErr_SetString(PyExc_SystemError,
Victor Stinnercd9950f2011-10-02 00:34:53 +0200949 "Cannot modify a string having more than 1 reference");
950 return -1;
951 }
952 _PyUnicode_DIRTY(unicode);
953 return 0;
954}
955
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200956Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200957PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
958 PyObject *from, Py_ssize_t from_start,
959 Py_ssize_t how_many)
960{
Victor Stinnera0702ab2011-09-29 14:14:38 +0200961 unsigned int from_kind, to_kind;
962 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200963
Victor Stinnerb1536152011-09-30 02:26:10 +0200964 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
965 PyErr_BadInternalCall();
966 return -1;
967 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200968
969 if (PyUnicode_READY(from))
970 return -1;
971 if (PyUnicode_READY(to))
972 return -1;
973
Victor Stinnerff9e50f2011-09-28 22:17:19 +0200974 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200975 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
Victor Stinner01698042011-10-04 00:04:26 +0200976 PyErr_Format(PyExc_SystemError,
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200977 "Cannot write %zi characters at %zi "
978 "in a string of %zi characters",
979 how_many, to_start, PyUnicode_GET_LENGTH(to));
980 return -1;
981 }
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200982 if (how_many == 0)
983 return 0;
984
Victor Stinnercd9950f2011-10-02 00:34:53 +0200985 if (_PyUnicode_Dirty(to))
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200986 return -1;
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200987
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200988 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200989 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200990 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200991 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200992
Victor Stinnerf42dc442011-10-02 23:33:16 +0200993 if (from_kind == to_kind
994 /* deny latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +0200995 && !(!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200996 {
Victor Stinnera0702ab2011-09-29 14:14:38 +0200997 Py_MEMCPY((char*)to_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200998 + PyUnicode_KIND_SIZE(to_kind, to_start),
Victor Stinnera0702ab2011-09-29 14:14:38 +0200999 (char*)from_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001000 + PyUnicode_KIND_SIZE(from_kind, from_start),
1001 PyUnicode_KIND_SIZE(to_kind, how_many));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001002 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001003 else if (from_kind == PyUnicode_1BYTE_KIND
1004 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001005 {
1006 _PyUnicode_CONVERT_BYTES(
1007 Py_UCS1, Py_UCS2,
1008 PyUnicode_1BYTE_DATA(from) + from_start,
1009 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1010 PyUnicode_2BYTE_DATA(to) + to_start
1011 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001012 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001013 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001014 && to_kind == PyUnicode_4BYTE_KIND)
1015 {
1016 _PyUnicode_CONVERT_BYTES(
1017 Py_UCS1, Py_UCS4,
1018 PyUnicode_1BYTE_DATA(from) + from_start,
1019 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1020 PyUnicode_4BYTE_DATA(to) + to_start
1021 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001022 }
1023 else if (from_kind == PyUnicode_2BYTE_KIND
1024 && to_kind == PyUnicode_4BYTE_KIND)
1025 {
1026 _PyUnicode_CONVERT_BYTES(
1027 Py_UCS2, Py_UCS4,
1028 PyUnicode_2BYTE_DATA(from) + from_start,
1029 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1030 PyUnicode_4BYTE_DATA(to) + to_start
1031 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001032 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001033 else {
1034 int invalid_kinds;
Victor Stinnerf42dc442011-10-02 23:33:16 +02001035
1036 /* check if max_char(from substring) <= max_char(to) */
1037 if (from_kind > to_kind
1038 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001039 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001040 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001041 /* slow path to check for character overflow */
1042 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1043 Py_UCS4 ch, maxchar;
1044 Py_ssize_t i;
1045
1046 maxchar = 0;
1047 invalid_kinds = 0;
1048 for (i=0; i < how_many; i++) {
1049 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1050 if (ch > maxchar) {
1051 maxchar = ch;
1052 if (maxchar > to_maxchar) {
1053 invalid_kinds = 1;
1054 break;
1055 }
1056 }
1057 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1058 }
1059 }
1060 else
1061 invalid_kinds = 1;
1062 if (invalid_kinds) {
Victor Stinner01698042011-10-04 00:04:26 +02001063 PyErr_Format(PyExc_SystemError,
Victor Stinnerf42dc442011-10-02 23:33:16 +02001064 "Cannot copy %s characters "
1065 "into a string of %s characters",
1066 unicode_kind_name(from),
1067 unicode_kind_name(to));
Victor Stinnera0702ab2011-09-29 14:14:38 +02001068 return -1;
1069 }
1070 }
1071 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001072}
1073
Victor Stinner17222162011-09-28 22:15:37 +02001074/* Find the maximum code point and count the number of surrogate pairs so a
1075 correct string length can be computed before converting a string to UCS4.
1076 This function counts single surrogates as a character and not as a pair.
1077
1078 Return 0 on success, or -1 on error. */
1079static int
1080find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1081 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001082{
1083 const wchar_t *iter;
1084
Victor Stinnerc53be962011-10-02 21:33:54 +02001085 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001086 *num_surrogates = 0;
1087 *maxchar = 0;
1088
1089 for (iter = begin; iter < end; ) {
Victor Stinnerae864852011-10-05 14:02:44 +02001090 if (*iter > *maxchar) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001091 *maxchar = *iter;
Victor Stinnerae864852011-10-05 14:02:44 +02001092#if SIZEOF_WCHAR_T != 2
1093 if (*maxchar >= 0x10000)
1094 return 0;
1095#endif
1096 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001097#if SIZEOF_WCHAR_T == 2
1098 if (*iter >= 0xD800 && *iter <= 0xDBFF
1099 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
1100 {
1101 Py_UCS4 surrogate_val;
1102 surrogate_val = (((iter[0] & 0x3FF)<<10)
1103 | (iter[1] & 0x3FF)) + 0x10000;
1104 ++(*num_surrogates);
1105 if (surrogate_val > *maxchar)
1106 *maxchar = surrogate_val;
1107 iter += 2;
1108 }
1109 else
1110 iter++;
1111#else
1112 iter++;
1113#endif
1114 }
1115 return 0;
1116}
1117
#ifdef Py_DEBUG
/* Debug-build counter: incremented on every call to unicode_ready(). */
int unicode_ready_calls = 0;
#endif
1121
/* Convert the legacy (wstr-only) string *p_obj into its canonical
   representation, choosing the narrowest storage kind (1, 2 or 4 bytes per
   character) able to hold every character of the wstr buffer.

   If `replace` is true, empty and single Latin-1 character strings are
   replaced in *p_obj by the shared singletons: the old reference is
   released and *p_obj receives a new reference.  In non-debug builds
   `replace` is silently ignored when the object has more than one
   reference.

   Depending on sizeof(wchar_t) and the chosen kind, the wstr buffer is
   either kept and shared as the data buffer, or converted into a newly
   allocated buffer and then freed.

   Return 0 on success, or -1 with an exception set on error. */
static int
unicode_ready(PyObject **p_obj, int replace)
{
    PyUnicodeObject *unicode;
    wchar_t *end;
    Py_UCS4 maxchar = 0;
    Py_ssize_t num_surrogates;
#if SIZEOF_WCHAR_T == 2
    Py_ssize_t length_wo_surrogates;
#endif

    assert(p_obj != NULL);
    unicode = (PyUnicodeObject *)*p_obj;

    /* _PyUnicode_Ready() is only intended for old-style API usage where
       strings were created using _PyObject_New() and where no canonical
       representation (the str field) has been set yet aka strings
       which are not yet ready. */
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
    assert(_PyUnicode_WSTR(unicode) != NULL);
    assert(_PyUnicode_DATA_ANY(unicode) == NULL);
    assert(_PyUnicode_UTF8(unicode) == NULL);
    /* Actually, it should neither be interned nor be anything else: */
    assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);

#ifdef Py_DEBUG
    ++unicode_ready_calls;
#endif

    /* Replacing the object is only possible when nobody else references
       it: debug builds assert this, other builds fall back to converting
       the object in place. */
#ifdef Py_DEBUG
    assert(!replace || Py_REFCNT(unicode) == 1);
#else
    if (replace && Py_REFCNT(unicode) != 1)
        replace = 0;
#endif
    if (replace) {
        Py_ssize_t len = _PyUnicode_WSTR_LENGTH(unicode);
        wchar_t *wstr = _PyUnicode_WSTR(unicode);
        /* Optimization for empty strings */
        if (len == 0) {
            Py_INCREF(unicode_empty);
            Py_DECREF(*p_obj);
            *p_obj = unicode_empty;
            return 0;
        }
        /* Single Latin-1 characters are replaced by the shared object. */
        if (len == 1 && wstr[0] < 256) {
            PyObject *latin1_char = get_latin1_char((unsigned char)wstr[0]);
            if (latin1_char == NULL)
                return -1;
            Py_DECREF(*p_obj);
            *p_obj = latin1_char;
            return 0;
        }
    }

    end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
    if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
                                &maxchar, &num_surrogates) == -1)
        return -1;

    if (maxchar < 256) {
        /* 1-byte storage: narrow into a new buffer and drop wstr. */
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
                                 _PyUnicode_WSTR(unicode), end,
                                 PyUnicode_1BYTE_DATA(unicode));
        PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
        if (maxchar < 128) {
            /* Pure ASCII: the data buffer doubles as the UTF-8 form. */
            _PyUnicode_STATE(unicode).ascii = 1;
            _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
            _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        }
        else {
            _PyUnicode_STATE(unicode).ascii = 0;
            _PyUnicode_UTF8(unicode) = NULL;
            _PyUnicode_UTF8_LENGTH(unicode) = 0;
        }
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
    }
    /* In this case we might have to convert down from 4-byte native
       wchar_t to 2-byte unicode. */
    else if (maxchar < 65536) {
        assert(num_surrogates == 0 &&
               "FindMaxCharAndNumSurrogatePairs() messed up");

#if SIZEOF_WCHAR_T == 2
        /* We can share representations and are done. */
        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
#else
        /* sizeof(wchar_t) == 4: narrow into a new 2-byte buffer and drop
           wstr. */
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
            2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
                                 _PyUnicode_WSTR(unicode), end,
                                 PyUnicode_2BYTE_DATA(unicode));
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
#endif
    }
    /* maxchar exceeds 16 bits, we need 4 bytes for unicode characters */
    else {
#if SIZEOF_WCHAR_T == 2
        /* in case the native representation is 2-bytes, we need to allocate a
           new normalized 4-byte version.  Each surrogate pair collapses into
           a single character, hence the shorter length. */
        length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        /* unicode_convert_wchar_to_ucs4() requires a ready string */
        _PyUnicode_STATE(unicode).ready = 1;
        unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
#else
        assert(num_surrogates == 0);

        /* wchar_t is already 4 bytes wide: share the wstr buffer. */
        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
#endif
        PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
    }
    _PyUnicode_STATE(unicode).ready = 1;
    assert(_PyUnicode_CheckConsistency(unicode, 1));
    return 0;
}
1280
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001281int
1282_PyUnicode_ReadyReplace(PyObject **op)
1283{
1284 return unicode_ready(op, 1);
1285}
1286
1287int
1288_PyUnicode_Ready(PyObject *op)
1289{
1290 return unicode_ready(&op, 0);
1291}
1292
Alexander Belopolsky40018472011-02-26 01:02:56 +00001293static void
1294unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001295{
Walter Dörwald16807132007-05-25 13:52:07 +00001296 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001297 case SSTATE_NOT_INTERNED:
1298 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001299
Benjamin Peterson29060642009-01-31 22:14:21 +00001300 case SSTATE_INTERNED_MORTAL:
1301 /* revive dead object temporarily for DelItem */
1302 Py_REFCNT(unicode) = 3;
1303 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
1304 Py_FatalError(
1305 "deletion of interned string failed");
1306 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001307
Benjamin Peterson29060642009-01-31 22:14:21 +00001308 case SSTATE_INTERNED_IMMORTAL:
1309 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001310
Benjamin Peterson29060642009-01-31 22:14:21 +00001311 default:
1312 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001313 }
1314
Victor Stinner03490912011-10-03 23:45:12 +02001315 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001316 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001317 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001318 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001319
1320 if (PyUnicode_IS_COMPACT(unicode)) {
1321 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001322 }
1323 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001324 if (_PyUnicode_DATA_ANY(unicode))
1325 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Benjamin Peterson29060642009-01-31 22:14:21 +00001326 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001327 }
1328}
1329
Alexander Belopolsky40018472011-02-26 01:02:56 +00001330static int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001331unicode_resizable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001332{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001333 if (Py_REFCNT(unicode) != 1)
1334 return 0;
1335 if (PyUnicode_CHECK_INTERNED(unicode))
1336 return 0;
Benjamin Peterson7f3140e2011-10-03 19:37:29 -04001337 assert(unicode != unicode_empty);
Victor Stinner77bb47b2011-10-03 20:06:05 +02001338#ifdef Py_DEBUG
1339 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND
1340 && PyUnicode_GET_LENGTH(unicode) == 1)
1341 {
1342 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001343 if (ch < 256 && unicode_latin1[ch] == unicode)
1344 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001345 }
Victor Stinner77bb47b2011-10-03 20:06:05 +02001346#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001347 return 1;
1348}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001349
Victor Stinnerfe226c02011-10-03 03:52:20 +02001350static int
1351unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1352{
1353 PyObject *unicode;
1354 Py_ssize_t old_length;
1355
1356 assert(p_unicode != NULL);
1357 unicode = *p_unicode;
1358
1359 assert(unicode != NULL);
1360 assert(PyUnicode_Check(unicode));
1361 assert(0 <= length);
1362
Victor Stinner910337b2011-10-03 03:20:16 +02001363 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001364 old_length = PyUnicode_WSTR_LENGTH(unicode);
1365 else
1366 old_length = PyUnicode_GET_LENGTH(unicode);
1367 if (old_length == length)
1368 return 0;
1369
Victor Stinnerfe226c02011-10-03 03:52:20 +02001370 if (!unicode_resizable(unicode)) {
1371 PyObject *copy = resize_copy(unicode, length);
1372 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001373 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001374 Py_DECREF(*p_unicode);
1375 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001376 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001377 }
1378
Victor Stinnerfe226c02011-10-03 03:52:20 +02001379 if (PyUnicode_IS_COMPACT(unicode)) {
1380 *p_unicode = resize_compact(unicode, length);
1381 if (*p_unicode == NULL)
1382 return -1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001383 assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001384 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001385 }
1386 return resize_inplace((PyUnicodeObject*)unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001387}
1388
Alexander Belopolsky40018472011-02-26 01:02:56 +00001389int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001390PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001391{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001392 PyObject *unicode;
1393 if (p_unicode == NULL) {
1394 PyErr_BadInternalCall();
1395 return -1;
1396 }
1397 unicode = *p_unicode;
1398 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0
1399 || _PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND)
1400 {
1401 PyErr_BadInternalCall();
1402 return -1;
1403 }
1404 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001405}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001406
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001407static PyObject*
1408get_latin1_char(unsigned char ch)
1409{
Victor Stinnera464fc12011-10-02 20:39:30 +02001410 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001411 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001412 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001413 if (!unicode)
1414 return NULL;
1415 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001416 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001417 unicode_latin1[ch] = unicode;
1418 }
1419 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001420 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001421}
1422
Alexander Belopolsky40018472011-02-26 01:02:56 +00001423PyObject *
1424PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001425{
1426 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001427 Py_UCS4 maxchar = 0;
1428 Py_ssize_t num_surrogates;
1429
1430 if (u == NULL)
1431 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001432
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001433 /* If the Unicode data is known at construction time, we can apply
1434 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001435
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001436 /* Optimization for empty strings */
1437 if (size == 0 && unicode_empty != NULL) {
1438 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001439 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001440 }
Tim Petersced69f82003-09-16 20:30:58 +00001441
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001442 /* Single character Unicode objects in the Latin-1 range are
1443 shared when using this constructor */
1444 if (size == 1 && *u < 256)
1445 return get_latin1_char((unsigned char)*u);
1446
1447 /* If not empty and not single character, copy the Unicode data
1448 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001449 if (find_maxchar_surrogates(u, u + size,
1450 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001451 return NULL;
1452
1453 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1454 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001455 if (!unicode)
1456 return NULL;
1457
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001458 switch (PyUnicode_KIND(unicode)) {
1459 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001460 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001461 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1462 break;
1463 case PyUnicode_2BYTE_KIND:
1464#if Py_UNICODE_SIZE == 2
1465 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1466#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001467 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001468 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1469#endif
1470 break;
1471 case PyUnicode_4BYTE_KIND:
1472#if SIZEOF_WCHAR_T == 2
1473 /* This is the only case which has to process surrogates, thus
1474 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001475 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001476#else
1477 assert(num_surrogates == 0);
1478 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1479#endif
1480 break;
1481 default:
1482 assert(0 && "Impossible state");
1483 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001484
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001485 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001486 return (PyObject *)unicode;
1487}
1488
Alexander Belopolsky40018472011-02-26 01:02:56 +00001489PyObject *
1490PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001491{
1492 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001493
Benjamin Peterson14339b62009-01-31 16:36:08 +00001494 if (size < 0) {
1495 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001496 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001497 return NULL;
1498 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001499
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001500 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001501 some optimizations which share commonly used objects.
1502 Also, this means the input must be UTF-8, so fall back to the
1503 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001504 if (u != NULL) {
1505
Benjamin Peterson29060642009-01-31 22:14:21 +00001506 /* Optimization for empty strings */
1507 if (size == 0 && unicode_empty != NULL) {
1508 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001509 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001510 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001511
1512 /* Single characters are shared when using this constructor.
1513 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001514 if (size == 1 && Py_CHARMASK(*u) < 128)
1515 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001516
1517 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001518 }
1519
Walter Dörwald55507312007-05-18 13:12:10 +00001520 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001521 if (!unicode)
1522 return NULL;
1523
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001524 return (PyObject *)unicode;
1525}
1526
Alexander Belopolsky40018472011-02-26 01:02:56 +00001527PyObject *
1528PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001529{
1530 size_t size = strlen(u);
1531 if (size > PY_SSIZE_T_MAX) {
1532 PyErr_SetString(PyExc_OverflowError, "input too long");
1533 return NULL;
1534 }
1535
1536 return PyUnicode_FromStringAndSize(u, size);
1537}
1538
Victor Stinnere57b1c02011-09-28 22:20:48 +02001539static PyObject*
Victor Stinner0617b6e2011-10-05 23:26:01 +02001540unicode_fromascii(const unsigned char* s, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001541{
Victor Stinner0617b6e2011-10-05 23:26:01 +02001542 PyObject *res;
1543#ifdef Py_DEBUG
1544 const unsigned char *p;
1545 const unsigned char *end = s + size;
1546 for (p=s; p < end; p++) {
1547 assert(*p < 128);
1548 }
1549#endif
1550 res = PyUnicode_New(size, 127);
Victor Stinner702c7342011-10-05 13:50:52 +02001551 if (!res)
1552 return NULL;
Victor Stinner0617b6e2011-10-05 23:26:01 +02001553 memcpy(PyUnicode_1BYTE_DATA(res), s, size);
Victor Stinner702c7342011-10-05 13:50:52 +02001554 return res;
1555}
1556
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001557static Py_UCS4
1558kind_maxchar_limit(unsigned int kind)
1559{
1560 switch(kind) {
1561 case PyUnicode_1BYTE_KIND:
1562 return 0x80;
1563 case PyUnicode_2BYTE_KIND:
1564 return 0x100;
1565 case PyUnicode_4BYTE_KIND:
1566 return 0x10000;
1567 default:
1568 assert(0 && "invalid kind");
1569 return 0x10ffff;
1570 }
1571}
1572
Victor Stinner702c7342011-10-05 13:50:52 +02001573static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001574_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001575{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001576 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001577 unsigned char max_char = 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001578 Py_ssize_t i;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001579
1580 assert(size >= 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001581 for (i = 0; i < size; i++) {
1582 if (u[i] & 0x80) {
Victor Stinnerb9275c12011-10-05 14:01:42 +02001583 max_char = 255;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001584 break;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001585 }
1586 }
Victor Stinnerb9275c12011-10-05 14:01:42 +02001587 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001588 if (!res)
1589 return NULL;
1590 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001591 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001592 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001593}
1594
Victor Stinnere57b1c02011-09-28 22:20:48 +02001595static PyObject*
1596_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001597{
1598 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001599 Py_UCS2 max_char = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001600 Py_ssize_t i;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001601
1602 assert(size >= 0);
1603 for (i = 0; i < size; i++) {
1604 if (u[i] > max_char) {
1605 max_char = u[i];
1606 if (max_char >= 256)
1607 break;
1608 }
1609 }
1610 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001611 if (!res)
1612 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001613 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001614 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1615 else
1616 for (i = 0; i < size; i++)
1617 PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i];
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001618 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001619 return res;
1620}
1621
Victor Stinnere57b1c02011-09-28 22:20:48 +02001622static PyObject*
1623_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001624{
1625 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001626 Py_UCS4 max_char = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001627 Py_ssize_t i;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001628
1629 assert(size >= 0);
1630 for (i = 0; i < size; i++) {
1631 if (u[i] > max_char) {
1632 max_char = u[i];
1633 if (max_char >= 0x10000)
1634 break;
1635 }
1636 }
1637 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001638 if (!res)
1639 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001640 if (max_char >= 0x10000)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001641 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1642 else {
1643 int kind = PyUnicode_KIND(res);
1644 void *data = PyUnicode_DATA(res);
1645 for (i = 0; i < size; i++)
1646 PyUnicode_WRITE(kind, data, i, u[i]);
1647 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001648 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001649 return res;
1650}
1651
1652PyObject*
1653PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1654{
1655 switch(kind) {
1656 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001657 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001658 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001659 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001660 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001661 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001662 default:
1663 assert(0 && "invalid kind");
1664 PyErr_SetString(PyExc_SystemError, "invalid kind");
1665 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001666 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001667}
1668
Victor Stinner034f6cf2011-09-30 02:26:44 +02001669PyObject*
1670PyUnicode_Copy(PyObject *unicode)
1671{
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001672 Py_ssize_t size;
1673 PyObject *copy;
1674 void *data;
1675
Victor Stinner034f6cf2011-09-30 02:26:44 +02001676 if (!PyUnicode_Check(unicode)) {
1677 PyErr_BadInternalCall();
1678 return NULL;
1679 }
1680 if (PyUnicode_READY(unicode))
1681 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001682
1683 size = PyUnicode_GET_LENGTH(unicode);
1684 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1685 if (!copy)
1686 return NULL;
1687 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1688
1689 data = PyUnicode_DATA(unicode);
1690 switch (PyUnicode_KIND(unicode))
1691 {
1692 case PyUnicode_1BYTE_KIND:
1693 memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1694 break;
1695 case PyUnicode_2BYTE_KIND:
1696 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1697 break;
1698 case PyUnicode_4BYTE_KIND:
1699 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1700 break;
1701 default:
1702 assert(0);
1703 break;
1704 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001705 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001706 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001707}
1708
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001709
Victor Stinnerbc603d12011-10-02 01:00:40 +02001710/* Widen Unicode objects to larger buffers. Don't write terminating null
1711 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001712
1713void*
1714_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1715{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001716 Py_ssize_t len;
1717 void *result;
1718 unsigned int skind;
1719
1720 if (PyUnicode_READY(s))
1721 return NULL;
1722
1723 len = PyUnicode_GET_LENGTH(s);
1724 skind = PyUnicode_KIND(s);
1725 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02001726 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001727 return NULL;
1728 }
1729 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001730 case PyUnicode_2BYTE_KIND:
1731 result = PyMem_Malloc(len * sizeof(Py_UCS2));
1732 if (!result)
1733 return PyErr_NoMemory();
1734 assert(skind == PyUnicode_1BYTE_KIND);
1735 _PyUnicode_CONVERT_BYTES(
1736 Py_UCS1, Py_UCS2,
1737 PyUnicode_1BYTE_DATA(s),
1738 PyUnicode_1BYTE_DATA(s) + len,
1739 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001740 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001741 case PyUnicode_4BYTE_KIND:
1742 result = PyMem_Malloc(len * sizeof(Py_UCS4));
1743 if (!result)
1744 return PyErr_NoMemory();
1745 if (skind == PyUnicode_2BYTE_KIND) {
1746 _PyUnicode_CONVERT_BYTES(
1747 Py_UCS2, Py_UCS4,
1748 PyUnicode_2BYTE_DATA(s),
1749 PyUnicode_2BYTE_DATA(s) + len,
1750 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001751 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001752 else {
1753 assert(skind == PyUnicode_1BYTE_KIND);
1754 _PyUnicode_CONVERT_BYTES(
1755 Py_UCS1, Py_UCS4,
1756 PyUnicode_1BYTE_DATA(s),
1757 PyUnicode_1BYTE_DATA(s) + len,
1758 result);
1759 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001760 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001761 default:
1762 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001763 }
Victor Stinner01698042011-10-04 00:04:26 +02001764 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001765 return NULL;
1766}
1767
1768static Py_UCS4*
1769as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1770 int copy_null)
1771{
1772 int kind;
1773 void *data;
1774 Py_ssize_t len, targetlen;
1775 if (PyUnicode_READY(string) == -1)
1776 return NULL;
1777 kind = PyUnicode_KIND(string);
1778 data = PyUnicode_DATA(string);
1779 len = PyUnicode_GET_LENGTH(string);
1780 targetlen = len;
1781 if (copy_null)
1782 targetlen++;
1783 if (!target) {
1784 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1785 PyErr_NoMemory();
1786 return NULL;
1787 }
1788 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1789 if (!target) {
1790 PyErr_NoMemory();
1791 return NULL;
1792 }
1793 }
1794 else {
1795 if (targetsize < targetlen) {
1796 PyErr_Format(PyExc_SystemError,
1797 "string is longer than the buffer");
1798 if (copy_null && 0 < targetsize)
1799 target[0] = 0;
1800 return NULL;
1801 }
1802 }
1803 if (kind != PyUnicode_4BYTE_KIND) {
1804 Py_ssize_t i;
1805 for (i = 0; i < len; i++)
1806 target[i] = PyUnicode_READ(kind, data, i);
1807 }
1808 else
1809 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1810 if (copy_null)
1811 target[len] = 0;
1812 return target;
1813}
1814
1815Py_UCS4*
1816PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1817 int copy_null)
1818{
1819 if (target == NULL || targetsize < 1) {
1820 PyErr_BadInternalCall();
1821 return NULL;
1822 }
1823 return as_ucs4(string, target, targetsize, copy_null);
1824}
1825
1826Py_UCS4*
1827PyUnicode_AsUCS4Copy(PyObject *string)
1828{
1829 return as_ucs4(string, NULL, 0, 1);
1830}
1831
1832#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00001833
Alexander Belopolsky40018472011-02-26 01:02:56 +00001834PyObject *
1835PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001836{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001837 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +00001838 if (size == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001839 return PyUnicode_New(0, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00001840 PyErr_BadInternalCall();
1841 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001842 }
1843
Martin v. Löwis790465f2008-04-05 20:41:37 +00001844 if (size == -1) {
1845 size = wcslen(w);
1846 }
1847
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001848 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001849}
1850
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001851#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00001852
Walter Dörwald346737f2007-05-31 10:44:43 +00001853static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001854makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
1855 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00001856{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001857 *fmt++ = '%';
1858 if (width) {
1859 if (zeropad)
1860 *fmt++ = '0';
1861 fmt += sprintf(fmt, "%d", width);
1862 }
1863 if (precision)
1864 fmt += sprintf(fmt, ".%d", precision);
1865 if (longflag)
1866 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001867 else if (longlongflag) {
1868 /* longlongflag should only ever be nonzero on machines with
1869 HAVE_LONG_LONG defined */
1870#ifdef HAVE_LONG_LONG
1871 char *f = PY_FORMAT_LONG_LONG;
1872 while (*f)
1873 *fmt++ = *f++;
1874#else
1875 /* we shouldn't ever get here */
1876 assert(0);
1877 *fmt++ = 'l';
1878#endif
1879 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001880 else if (size_tflag) {
1881 char *f = PY_FORMAT_SIZE_T;
1882 while (*f)
1883 *fmt++ = *f++;
1884 }
1885 *fmt++ = c;
1886 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00001887}
1888
Victor Stinner96865452011-03-01 23:44:09 +00001889/* helper for PyUnicode_FromFormatV() */
1890
1891static const char*
1892parse_format_flags(const char *f,
1893 int *p_width, int *p_precision,
1894 int *p_longflag, int *p_longlongflag, int *p_size_tflag)
1895{
1896 int width, precision, longflag, longlongflag, size_tflag;
1897
1898 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
1899 f++;
1900 width = 0;
1901 while (Py_ISDIGIT((unsigned)*f))
1902 width = (width*10) + *f++ - '0';
1903 precision = 0;
1904 if (*f == '.') {
1905 f++;
1906 while (Py_ISDIGIT((unsigned)*f))
1907 precision = (precision*10) + *f++ - '0';
1908 if (*f == '%') {
1909 /* "%.3%s" => f points to "3" */
1910 f--;
1911 }
1912 }
1913 if (*f == '\0') {
1914 /* bogus format "%.1" => go backward, f points to "1" */
1915 f--;
1916 }
1917 if (p_width != NULL)
1918 *p_width = width;
1919 if (p_precision != NULL)
1920 *p_precision = precision;
1921
1922 /* Handle %ld, %lu, %lld and %llu. */
1923 longflag = 0;
1924 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00001925 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00001926
1927 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00001928 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00001929 longflag = 1;
1930 ++f;
1931 }
1932#ifdef HAVE_LONG_LONG
1933 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00001934 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00001935 longlongflag = 1;
1936 f += 2;
1937 }
1938#endif
1939 }
1940 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00001941 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00001942 size_tflag = 1;
1943 ++f;
1944 }
1945 if (p_longflag != NULL)
1946 *p_longflag = longflag;
1947 if (p_longlongflag != NULL)
1948 *p_longlongflag = longlongflag;
1949 if (p_size_tflag != NULL)
1950 *p_size_tflag = size_tflag;
1951 return f;
1952}
1953
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001954/* maximum number of characters required for output of %ld. 21 characters
1955 allows for 64-bit integers (in decimal) and an optional sign. */
1956#define MAX_LONG_CHARS 21
1957/* maximum number of characters required for output of %lld.
1958 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
1959 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
1960#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
1961
Walter Dörwaldd2034312007-05-18 16:29:38 +00001962PyObject *
1963PyUnicode_FromFormatV(const char *format, va_list vargs)
1964{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001965 va_list count;
1966 Py_ssize_t callcount = 0;
1967 PyObject **callresults = NULL;
1968 PyObject **callresult = NULL;
1969 Py_ssize_t n = 0;
1970 int width = 0;
1971 int precision = 0;
1972 int zeropad;
1973 const char* f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001974 PyUnicodeObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001975 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001976 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001977 Py_UCS4 maxchar = 127; /* result is ASCII by default */
1978 Py_UCS4 argmaxchar;
1979 Py_ssize_t numbersize = 0;
1980 char *numberresults = NULL;
1981 char *numberresult = NULL;
1982 Py_ssize_t i;
1983 int kind;
1984 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001985
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001986 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001987 /* step 1: count the number of %S/%R/%A/%s format specifications
1988 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
1989 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001990 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02001991 * also estimate a upper bound for all the number formats in the string,
1992 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001993 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00001994 for (f = format; *f; f++) {
1995 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001996 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001997 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
1998 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
1999 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2000 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002001
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002002 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002003#ifdef HAVE_LONG_LONG
2004 if (longlongflag) {
2005 if (width < MAX_LONG_LONG_CHARS)
2006 width = MAX_LONG_LONG_CHARS;
2007 }
2008 else
2009#endif
2010 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2011 including sign. Decimal takes the most space. This
2012 isn't enough for octal. If a width is specified we
2013 need more (which we allocate later). */
2014 if (width < MAX_LONG_CHARS)
2015 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002016
2017 /* account for the size + '\0' to separate numbers
2018 inside of the numberresults buffer */
2019 numbersize += (width + 1);
2020 }
2021 }
2022 else if ((unsigned char)*f > 127) {
2023 PyErr_Format(PyExc_ValueError,
2024 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2025 "string, got a non-ASCII byte: 0x%02x",
2026 (unsigned char)*f);
2027 return NULL;
2028 }
2029 }
2030 /* step 2: allocate memory for the results of
2031 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2032 if (callcount) {
2033 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2034 if (!callresults) {
2035 PyErr_NoMemory();
2036 return NULL;
2037 }
2038 callresult = callresults;
2039 }
2040 /* step 2.5: allocate memory for the results of formating numbers */
2041 if (numbersize) {
2042 numberresults = PyObject_Malloc(numbersize);
2043 if (!numberresults) {
2044 PyErr_NoMemory();
2045 goto fail;
2046 }
2047 numberresult = numberresults;
2048 }
2049
2050 /* step 3: format numbers and figure out how large a buffer we need */
2051 for (f = format; *f; f++) {
2052 if (*f == '%') {
2053 const char* p;
2054 int longflag;
2055 int longlongflag;
2056 int size_tflag;
2057 int numprinted;
2058
2059 p = f;
2060 zeropad = (f[1] == '0');
2061 f = parse_format_flags(f, &width, &precision,
2062 &longflag, &longlongflag, &size_tflag);
2063 switch (*f) {
2064 case 'c':
2065 {
2066 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002067 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002068 n++;
2069 break;
2070 }
2071 case '%':
2072 n++;
2073 break;
2074 case 'i':
2075 case 'd':
2076 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2077 width, precision, *f);
2078 if (longflag)
2079 numprinted = sprintf(numberresult, fmt,
2080 va_arg(count, long));
2081#ifdef HAVE_LONG_LONG
2082 else if (longlongflag)
2083 numprinted = sprintf(numberresult, fmt,
2084 va_arg(count, PY_LONG_LONG));
2085#endif
2086 else if (size_tflag)
2087 numprinted = sprintf(numberresult, fmt,
2088 va_arg(count, Py_ssize_t));
2089 else
2090 numprinted = sprintf(numberresult, fmt,
2091 va_arg(count, int));
2092 n += numprinted;
2093 /* advance by +1 to skip over the '\0' */
2094 numberresult += (numprinted + 1);
2095 assert(*(numberresult - 1) == '\0');
2096 assert(*(numberresult - 2) != '\0');
2097 assert(numprinted >= 0);
2098 assert(numberresult <= numberresults + numbersize);
2099 break;
2100 case 'u':
2101 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2102 width, precision, 'u');
2103 if (longflag)
2104 numprinted = sprintf(numberresult, fmt,
2105 va_arg(count, unsigned long));
2106#ifdef HAVE_LONG_LONG
2107 else if (longlongflag)
2108 numprinted = sprintf(numberresult, fmt,
2109 va_arg(count, unsigned PY_LONG_LONG));
2110#endif
2111 else if (size_tflag)
2112 numprinted = sprintf(numberresult, fmt,
2113 va_arg(count, size_t));
2114 else
2115 numprinted = sprintf(numberresult, fmt,
2116 va_arg(count, unsigned int));
2117 n += numprinted;
2118 numberresult += (numprinted + 1);
2119 assert(*(numberresult - 1) == '\0');
2120 assert(*(numberresult - 2) != '\0');
2121 assert(numprinted >= 0);
2122 assert(numberresult <= numberresults + numbersize);
2123 break;
2124 case 'x':
2125 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2126 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2127 n += numprinted;
2128 numberresult += (numprinted + 1);
2129 assert(*(numberresult - 1) == '\0');
2130 assert(*(numberresult - 2) != '\0');
2131 assert(numprinted >= 0);
2132 assert(numberresult <= numberresults + numbersize);
2133 break;
2134 case 'p':
2135 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2136 /* %p is ill-defined: ensure leading 0x. */
2137 if (numberresult[1] == 'X')
2138 numberresult[1] = 'x';
2139 else if (numberresult[1] != 'x') {
2140 memmove(numberresult + 2, numberresult,
2141 strlen(numberresult) + 1);
2142 numberresult[0] = '0';
2143 numberresult[1] = 'x';
2144 numprinted += 2;
2145 }
2146 n += numprinted;
2147 numberresult += (numprinted + 1);
2148 assert(*(numberresult - 1) == '\0');
2149 assert(*(numberresult - 2) != '\0');
2150 assert(numprinted >= 0);
2151 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002152 break;
2153 case 's':
2154 {
2155 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002156 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002157 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
2158 if (!str)
2159 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002160 /* since PyUnicode_DecodeUTF8 returns already flexible
2161 unicode objects, there is no need to call ready on them */
2162 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002163 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002164 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002165 /* Remember the str and switch to the next slot */
2166 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002167 break;
2168 }
2169 case 'U':
2170 {
2171 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002172 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002173 if (PyUnicode_READY(obj) == -1)
2174 goto fail;
2175 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002176 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002177 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002178 break;
2179 }
2180 case 'V':
2181 {
2182 PyObject *obj = va_arg(count, PyObject *);
2183 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002184 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002185 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002186 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002187 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002188 if (PyUnicode_READY(obj) == -1)
2189 goto fail;
2190 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002191 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002192 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002193 *callresult++ = NULL;
2194 }
2195 else {
2196 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2197 if (!str_obj)
2198 goto fail;
Victor Stinnere1335c72011-10-04 20:53:03 +02002199 if (PyUnicode_READY(str_obj)) {
2200 Py_DECREF(str_obj);
2201 goto fail;
2202 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002203 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002204 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002205 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002206 *callresult++ = str_obj;
2207 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002208 break;
2209 }
2210 case 'S':
2211 {
2212 PyObject *obj = va_arg(count, PyObject *);
2213 PyObject *str;
2214 assert(obj);
2215 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002216 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002217 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002218 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002219 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002220 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002221 /* Remember the str and switch to the next slot */
2222 *callresult++ = str;
2223 break;
2224 }
2225 case 'R':
2226 {
2227 PyObject *obj = va_arg(count, PyObject *);
2228 PyObject *repr;
2229 assert(obj);
2230 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002231 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002232 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002233 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002234 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002235 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002236 /* Remember the repr and switch to the next slot */
2237 *callresult++ = repr;
2238 break;
2239 }
2240 case 'A':
2241 {
2242 PyObject *obj = va_arg(count, PyObject *);
2243 PyObject *ascii;
2244 assert(obj);
2245 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002246 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002247 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002248 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002249 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002250 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002251 /* Remember the repr and switch to the next slot */
2252 *callresult++ = ascii;
2253 break;
2254 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002255 default:
2256 /* if we stumble upon an unknown
2257 formatting code, copy the rest of
2258 the format string to the output
2259 string. (we cannot just skip the
2260 code, since there's no way to know
2261 what's in the argument list) */
2262 n += strlen(p);
2263 goto expand;
2264 }
2265 } else
2266 n++;
2267 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002268 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002269 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002270 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002271 we don't have to resize the string.
2272 There can be no errors beyond this point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002273 string = (PyUnicodeObject *)PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002274 if (!string)
2275 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002276 kind = PyUnicode_KIND(string);
2277 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002278 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002279 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002280
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002281 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002282 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002283 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002284
2285 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002286 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2287 /* checking for == because the last argument could be a empty
2288 string, which causes i to point to end, the assert at the end of
2289 the loop */
2290 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002291
Benjamin Peterson14339b62009-01-31 16:36:08 +00002292 switch (*f) {
2293 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002294 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002295 const int ordinal = va_arg(vargs, int);
2296 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002297 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002298 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002299 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002300 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002301 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002302 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002303 case 'p':
2304 /* unused, since we already have the result */
2305 if (*f == 'p')
2306 (void) va_arg(vargs, void *);
2307 else
2308 (void) va_arg(vargs, int);
2309 /* extract the result from numberresults and append. */
2310 for (; *numberresult; ++i, ++numberresult)
2311 PyUnicode_WRITE(kind, data, i, *numberresult);
2312 /* skip over the separating '\0' */
2313 assert(*numberresult == '\0');
2314 numberresult++;
2315 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002316 break;
2317 case 's':
2318 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002319 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002320 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002321 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002322 size = PyUnicode_GET_LENGTH(*callresult);
2323 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002324 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2325 *callresult, 0,
2326 size) < 0)
2327 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002328 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002329 /* We're done with the unicode()/repr() => forget it */
2330 Py_DECREF(*callresult);
2331 /* switch to next unicode()/repr() result */
2332 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002333 break;
2334 }
2335 case 'U':
2336 {
2337 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002338 Py_ssize_t size;
2339 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2340 size = PyUnicode_GET_LENGTH(obj);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002341 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2342 obj, 0,
2343 size) < 0)
2344 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002345 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002346 break;
2347 }
2348 case 'V':
2349 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002350 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002351 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002352 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002353 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002354 size = PyUnicode_GET_LENGTH(obj);
2355 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002356 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2357 obj, 0,
2358 size) < 0)
2359 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002360 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002361 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002362 size = PyUnicode_GET_LENGTH(*callresult);
2363 assert(PyUnicode_KIND(*callresult) <=
2364 PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002365 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2366 *callresult,
2367 0, size) < 0)
2368 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002369 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002370 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002371 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002372 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002373 break;
2374 }
2375 case 'S':
2376 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002377 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002378 {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002379 /* unused, since we already have the result */
2380 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002381 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002382 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2383 *callresult, 0,
2384 PyUnicode_GET_LENGTH(*callresult)) < 0)
2385 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002386 i += PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002387 /* We're done with the unicode()/repr() => forget it */
2388 Py_DECREF(*callresult);
2389 /* switch to next unicode()/repr() result */
2390 ++callresult;
2391 break;
2392 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002393 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002394 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002395 break;
2396 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002397 for (; *p; ++p, ++i)
2398 PyUnicode_WRITE(kind, data, i, *p);
2399 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002400 goto end;
2401 }
Victor Stinner1205f272010-09-11 00:54:47 +00002402 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002403 else {
2404 assert(i < PyUnicode_GET_LENGTH(string));
2405 PyUnicode_WRITE(kind, data, i++, *f);
2406 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002407 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002408 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002409
Benjamin Peterson29060642009-01-31 22:14:21 +00002410 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002411 if (callresults)
2412 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002413 if (numberresults)
2414 PyObject_Free(numberresults);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002415 assert(_PyUnicode_CheckConsistency(string, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002416 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00002417 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002418 if (callresults) {
2419 PyObject **callresult2 = callresults;
2420 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002421 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002422 ++callresult2;
2423 }
2424 PyObject_Free(callresults);
2425 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002426 if (numberresults)
2427 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002428 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002429}
2430
Walter Dörwaldd2034312007-05-18 16:29:38 +00002431PyObject *
2432PyUnicode_FromFormat(const char *format, ...)
2433{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002434 PyObject* ret;
2435 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002436
2437#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002438 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002439#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002440 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002441#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002442 ret = PyUnicode_FromFormatV(format, vargs);
2443 va_end(vargs);
2444 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002445}
2446
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002447#ifdef HAVE_WCHAR_H
2448
Victor Stinner5593d8a2010-10-02 11:11:27 +00002449/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2450 convert a Unicode object to a wide character string.
2451
Victor Stinnerd88d9832011-09-06 02:00:05 +02002452 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002453 character) required to convert the unicode object. Ignore size argument.
2454
Victor Stinnerd88d9832011-09-06 02:00:05 +02002455 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002456 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002457 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002458static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00002459unicode_aswidechar(PyUnicodeObject *unicode,
2460 wchar_t *w,
2461 Py_ssize_t size)
2462{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002463 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002464 const wchar_t *wstr;
2465
2466 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
2467 if (wstr == NULL)
2468 return -1;
2469
Victor Stinner5593d8a2010-10-02 11:11:27 +00002470 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002471 if (size > res)
2472 size = res + 1;
2473 else
2474 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002475 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002476 return res;
2477 }
2478 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002479 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002480}
2481
2482Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002483PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002484 wchar_t *w,
2485 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002486{
2487 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002488 PyErr_BadInternalCall();
2489 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002490 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002491 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002492}
2493
Victor Stinner137c34c2010-09-29 10:25:54 +00002494wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002495PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002496 Py_ssize_t *size)
2497{
2498 wchar_t* buffer;
2499 Py_ssize_t buflen;
2500
2501 if (unicode == NULL) {
2502 PyErr_BadInternalCall();
2503 return NULL;
2504 }
2505
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002506 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002507 if (buflen == -1)
2508 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002509 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002510 PyErr_NoMemory();
2511 return NULL;
2512 }
2513
Victor Stinner137c34c2010-09-29 10:25:54 +00002514 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2515 if (buffer == NULL) {
2516 PyErr_NoMemory();
2517 return NULL;
2518 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002519 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002520 if (buflen == -1)
2521 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002522 if (size != NULL)
2523 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002524 return buffer;
2525}
2526
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002527#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002528
Alexander Belopolsky40018472011-02-26 01:02:56 +00002529PyObject *
2530PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002531{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002532 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002533 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002534 PyErr_SetString(PyExc_ValueError,
2535 "chr() arg not in range(0x110000)");
2536 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002537 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002538
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002539 if (ordinal < 256)
2540 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002541
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002542 v = PyUnicode_New(1, ordinal);
2543 if (v == NULL)
2544 return NULL;
2545 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002546 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002547 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002548}
2549
Alexander Belopolsky40018472011-02-26 01:02:56 +00002550PyObject *
2551PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002552{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002553 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002554 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002555 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002556 if (PyUnicode_READY(obj))
2557 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002558 Py_INCREF(obj);
2559 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002560 }
2561 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002562 /* For a Unicode subtype that's not a Unicode object,
2563 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002564 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002565 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002566 PyErr_Format(PyExc_TypeError,
2567 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002568 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002569 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002570}
2571
Alexander Belopolsky40018472011-02-26 01:02:56 +00002572PyObject *
2573PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002574 const char *encoding,
2575 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002576{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002577 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002578 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002579
Guido van Rossumd57fd912000-03-10 22:53:23 +00002580 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002581 PyErr_BadInternalCall();
2582 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002583 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002584
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002585 /* Decoding bytes objects is the most common case and should be fast */
2586 if (PyBytes_Check(obj)) {
2587 if (PyBytes_GET_SIZE(obj) == 0) {
2588 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002589 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002590 }
2591 else {
2592 v = PyUnicode_Decode(
2593 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2594 encoding, errors);
2595 }
2596 return v;
2597 }
2598
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002599 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002600 PyErr_SetString(PyExc_TypeError,
2601 "decoding str is not supported");
2602 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002603 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002604
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002605 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2606 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2607 PyErr_Format(PyExc_TypeError,
2608 "coercing to str: need bytes, bytearray "
2609 "or buffer-like object, %.80s found",
2610 Py_TYPE(obj)->tp_name);
2611 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002612 }
Tim Petersced69f82003-09-16 20:30:58 +00002613
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002614 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002615 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002616 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002617 }
Tim Petersced69f82003-09-16 20:30:58 +00002618 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002619 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002620
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002621 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002622 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002623}
2624
/* Write a normalized copy of `encoding` into `lower` (a buffer of
   `lower_len` bytes): upper case letters become lower case and '_'
   becomes '-', so that e.g. UTF_8 is recognized.  Return 0 on error
   (encoding is longer than lower_len-1), 1 on success.  The output is
   null-terminated only on success. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst = lower;
    /* Last slot of the buffer, reserved for the terminating null. */
    char *const last = &lower[lower_len - 1];

    for (src = encoding; *src != '\0'; src++) {
        if (dst == last)
            return 0;

        if (Py_ISUPPER(*src))
            *dst = Py_TOLOWER(*src);
        else if (*src == '_')
            *dst = '-';
        else
            *dst = *src;
        dst++;
    }
    *dst = '\0';
    return 1;
}
2657
Alexander Belopolsky40018472011-02-26 01:02:56 +00002658PyObject *
2659PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002660 Py_ssize_t size,
2661 const char *encoding,
2662 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002663{
2664 PyObject *buffer = NULL, *unicode;
2665 Py_buffer info;
2666 char lower[11]; /* Enough for any encoding shortcut */
2667
2668 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002669 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00002670
2671 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002672 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002673 if ((strcmp(lower, "utf-8") == 0) ||
2674 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002675 return PyUnicode_DecodeUTF8(s, size, errors);
2676 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002677 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002678 (strcmp(lower, "iso-8859-1") == 0))
2679 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002680#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002681 else if (strcmp(lower, "mbcs") == 0)
2682 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002683#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002684 else if (strcmp(lower, "ascii") == 0)
2685 return PyUnicode_DecodeASCII(s, size, errors);
2686 else if (strcmp(lower, "utf-16") == 0)
2687 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2688 else if (strcmp(lower, "utf-32") == 0)
2689 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2690 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002691
2692 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002693 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002694 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002695 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002696 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002697 if (buffer == NULL)
2698 goto onError;
2699 unicode = PyCodec_Decode(buffer, encoding, errors);
2700 if (unicode == NULL)
2701 goto onError;
2702 if (!PyUnicode_Check(unicode)) {
2703 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002704 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002705 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002706 Py_DECREF(unicode);
2707 goto onError;
2708 }
2709 Py_DECREF(buffer);
Victor Stinner17efeed2011-10-04 20:05:46 +02002710#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02002711 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002712 Py_DECREF(unicode);
2713 return NULL;
2714 }
Victor Stinner17efeed2011-10-04 20:05:46 +02002715#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002716 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002717 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002718
Benjamin Peterson29060642009-01-31 22:14:21 +00002719 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002720 Py_XDECREF(buffer);
2721 return NULL;
2722}
2723
Alexander Belopolsky40018472011-02-26 01:02:56 +00002724PyObject *
2725PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002726 const char *encoding,
2727 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002728{
2729 PyObject *v;
2730
2731 if (!PyUnicode_Check(unicode)) {
2732 PyErr_BadArgument();
2733 goto onError;
2734 }
2735
2736 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002737 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002738
2739 /* Decode via the codec registry */
2740 v = PyCodec_Decode(unicode, encoding, errors);
2741 if (v == NULL)
2742 goto onError;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002743 assert(_PyUnicode_CheckConsistency(v, 1));
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002744 return v;
2745
Benjamin Peterson29060642009-01-31 22:14:21 +00002746 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002747 return NULL;
2748}
2749
Alexander Belopolsky40018472011-02-26 01:02:56 +00002750PyObject *
2751PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002752 const char *encoding,
2753 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002754{
2755 PyObject *v;
2756
2757 if (!PyUnicode_Check(unicode)) {
2758 PyErr_BadArgument();
2759 goto onError;
2760 }
2761
2762 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002763 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002764
2765 /* Decode via the codec registry */
2766 v = PyCodec_Decode(unicode, encoding, errors);
2767 if (v == NULL)
2768 goto onError;
2769 if (!PyUnicode_Check(v)) {
2770 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002771 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002772 Py_TYPE(v)->tp_name);
2773 Py_DECREF(v);
2774 goto onError;
2775 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002776 assert(_PyUnicode_CheckConsistency(v, 1));
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002777 return v;
2778
Benjamin Peterson29060642009-01-31 22:14:21 +00002779 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002780 return NULL;
2781}
2782
Alexander Belopolsky40018472011-02-26 01:02:56 +00002783PyObject *
2784PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002785 Py_ssize_t size,
2786 const char *encoding,
2787 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002788{
2789 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002790
Guido van Rossumd57fd912000-03-10 22:53:23 +00002791 unicode = PyUnicode_FromUnicode(s, size);
2792 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002793 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002794 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2795 Py_DECREF(unicode);
2796 return v;
2797}
2798
Alexander Belopolsky40018472011-02-26 01:02:56 +00002799PyObject *
2800PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002801 const char *encoding,
2802 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002803{
2804 PyObject *v;
2805
2806 if (!PyUnicode_Check(unicode)) {
2807 PyErr_BadArgument();
2808 goto onError;
2809 }
2810
2811 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002812 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002813
2814 /* Encode via the codec registry */
2815 v = PyCodec_Encode(unicode, encoding, errors);
2816 if (v == NULL)
2817 goto onError;
2818 return v;
2819
Benjamin Peterson29060642009-01-31 22:14:21 +00002820 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002821 return NULL;
2822}
2823
Victor Stinnerad158722010-10-27 00:25:46 +00002824PyObject *
2825PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00002826{
Victor Stinner99b95382011-07-04 14:23:54 +02002827#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002828 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2829 PyUnicode_GET_SIZE(unicode),
2830 NULL);
2831#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002832 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00002833#else
Victor Stinner793b5312011-04-27 00:24:21 +02002834 PyInterpreterState *interp = PyThreadState_GET()->interp;
2835 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2836 cannot use it to encode and decode filenames before it is loaded. Load
2837 the Python codec requires to encode at least its own filename. Use the C
2838 version of the locale codec until the codec registry is initialized and
2839 the Python codec is loaded.
2840
2841 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2842 cannot only rely on it: check also interp->fscodec_initialized for
2843 subinterpreters. */
2844 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00002845 return PyUnicode_AsEncodedString(unicode,
2846 Py_FileSystemDefaultEncoding,
2847 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00002848 }
2849 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002850 /* locale encoding with surrogateescape */
2851 wchar_t *wchar;
2852 char *bytes;
2853 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00002854 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002855
2856 wchar = PyUnicode_AsWideCharString(unicode, NULL);
2857 if (wchar == NULL)
2858 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002859 bytes = _Py_wchar2char(wchar, &error_pos);
2860 if (bytes == NULL) {
2861 if (error_pos != (size_t)-1) {
2862 char *errmsg = strerror(errno);
2863 PyObject *exc = NULL;
2864 if (errmsg == NULL)
2865 errmsg = "Py_wchar2char() failed";
2866 raise_encode_exception(&exc,
2867 "filesystemencoding",
2868 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
2869 error_pos, error_pos+1,
2870 errmsg);
2871 Py_XDECREF(exc);
2872 }
2873 else
2874 PyErr_NoMemory();
2875 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002876 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002877 }
2878 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002879
2880 bytes_obj = PyBytes_FromString(bytes);
2881 PyMem_Free(bytes);
2882 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00002883 }
Victor Stinnerad158722010-10-27 00:25:46 +00002884#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00002885}
2886
Alexander Belopolsky40018472011-02-26 01:02:56 +00002887PyObject *
2888PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002889 const char *encoding,
2890 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002891{
2892 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00002893 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00002894
Guido van Rossumd57fd912000-03-10 22:53:23 +00002895 if (!PyUnicode_Check(unicode)) {
2896 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002897 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002898 }
Fred Drakee4315f52000-05-09 19:53:39 +00002899
Victor Stinner2f283c22011-03-02 01:21:46 +00002900 if (encoding == NULL) {
2901 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002902 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002903 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002904 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner2f283c22011-03-02 01:21:46 +00002905 }
Fred Drakee4315f52000-05-09 19:53:39 +00002906
2907 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002908 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002909 if ((strcmp(lower, "utf-8") == 0) ||
2910 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00002911 {
Victor Stinner2f283c22011-03-02 01:21:46 +00002912 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002913 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002914 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002915 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00002916 }
Victor Stinner37296e82010-06-10 13:36:23 +00002917 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002918 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002919 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002920 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002921#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002922 else if (strcmp(lower, "mbcs") == 0)
2923 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2924 PyUnicode_GET_SIZE(unicode),
2925 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002926#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002927 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002928 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00002929 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002930
2931 /* Encode via the codec registry */
2932 v = PyCodec_Encode(unicode, encoding, errors);
2933 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002934 return NULL;
2935
2936 /* The normal path */
2937 if (PyBytes_Check(v))
2938 return v;
2939
2940 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002941 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002942 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002943 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002944
2945 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
2946 "encoder %s returned bytearray instead of bytes",
2947 encoding);
2948 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002949 Py_DECREF(v);
2950 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002951 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002952
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002953 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
2954 Py_DECREF(v);
2955 return b;
2956 }
2957
2958 PyErr_Format(PyExc_TypeError,
2959 "encoder did not return a bytes object (type=%.400s)",
2960 Py_TYPE(v)->tp_name);
2961 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002962 return NULL;
2963}
2964
Alexander Belopolsky40018472011-02-26 01:02:56 +00002965PyObject *
2966PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002967 const char *encoding,
2968 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002969{
2970 PyObject *v;
2971
2972 if (!PyUnicode_Check(unicode)) {
2973 PyErr_BadArgument();
2974 goto onError;
2975 }
2976
2977 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002978 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002979
2980 /* Encode via the codec registry */
2981 v = PyCodec_Encode(unicode, encoding, errors);
2982 if (v == NULL)
2983 goto onError;
2984 if (!PyUnicode_Check(v)) {
2985 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002986 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002987 Py_TYPE(v)->tp_name);
2988 Py_DECREF(v);
2989 goto onError;
2990 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002991 return v;
Tim Petersced69f82003-09-16 20:30:58 +00002992
Benjamin Peterson29060642009-01-31 22:14:21 +00002993 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002994 return NULL;
2995}
2996
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002997PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00002998PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002999 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003000 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3001}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003002
Christian Heimes5894ba72007-11-04 11:43:14 +00003003PyObject*
3004PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3005{
Victor Stinner99b95382011-07-04 14:23:54 +02003006#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003007 return PyUnicode_DecodeMBCS(s, size, NULL);
3008#elif defined(__APPLE__)
3009 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
3010#else
Victor Stinner793b5312011-04-27 00:24:21 +02003011 PyInterpreterState *interp = PyThreadState_GET()->interp;
3012 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3013 cannot use it to encode and decode filenames before it is loaded. Load
3014 the Python codec requires to encode at least its own filename. Use the C
3015 version of the locale codec until the codec registry is initialized and
3016 the Python codec is loaded.
3017
3018 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3019 cannot only rely on it: check also interp->fscodec_initialized for
3020 subinterpreters. */
3021 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003022 return PyUnicode_Decode(s, size,
3023 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003024 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003025 }
3026 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003027 /* locale encoding with surrogateescape */
3028 wchar_t *wchar;
3029 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00003030 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003031
3032 if (s[size] != '\0' || size != strlen(s)) {
3033 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3034 return NULL;
3035 }
3036
Victor Stinner168e1172010-10-16 23:16:16 +00003037 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003038 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00003039 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003040
Victor Stinner168e1172010-10-16 23:16:16 +00003041 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003042 PyMem_Free(wchar);
3043 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003044 }
Victor Stinnerad158722010-10-27 00:25:46 +00003045#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003046}
3047
Martin v. Löwis011e8422009-05-05 04:43:17 +00003048
3049int
3050PyUnicode_FSConverter(PyObject* arg, void* addr)
3051{
3052 PyObject *output = NULL;
3053 Py_ssize_t size;
3054 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003055 if (arg == NULL) {
3056 Py_DECREF(*(PyObject**)addr);
3057 return 1;
3058 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003059 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003060 output = arg;
3061 Py_INCREF(output);
3062 }
3063 else {
3064 arg = PyUnicode_FromObject(arg);
3065 if (!arg)
3066 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003067 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003068 Py_DECREF(arg);
3069 if (!output)
3070 return 0;
3071 if (!PyBytes_Check(output)) {
3072 Py_DECREF(output);
3073 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3074 return 0;
3075 }
3076 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003077 size = PyBytes_GET_SIZE(output);
3078 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003079 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003080 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003081 Py_DECREF(output);
3082 return 0;
3083 }
3084 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003085 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003086}
3087
3088
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003089int
3090PyUnicode_FSDecoder(PyObject* arg, void* addr)
3091{
3092 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003093 if (arg == NULL) {
3094 Py_DECREF(*(PyObject**)addr);
3095 return 1;
3096 }
3097 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003098 if (PyUnicode_READY(arg))
3099 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003100 output = arg;
3101 Py_INCREF(output);
3102 }
3103 else {
3104 arg = PyBytes_FromObject(arg);
3105 if (!arg)
3106 return 0;
3107 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3108 PyBytes_GET_SIZE(arg));
3109 Py_DECREF(arg);
3110 if (!output)
3111 return 0;
3112 if (!PyUnicode_Check(output)) {
3113 Py_DECREF(output);
3114 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3115 return 0;
3116 }
3117 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003118 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
3119 PyUnicode_GET_LENGTH(output), 0, 1)) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003120 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3121 Py_DECREF(output);
3122 return 0;
3123 }
3124 *(PyObject**)addr = output;
3125 return Py_CLEANUP_SUPPORTED;
3126}
3127
3128
Martin v. Löwis5b222132007-06-10 09:51:05 +00003129char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003130PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003131{
Christian Heimesf3863112007-11-22 07:46:41 +00003132 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003133 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
3134
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003135 if (!PyUnicode_Check(unicode)) {
3136 PyErr_BadArgument();
3137 return NULL;
3138 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003139 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003140 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003141
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003142 if (PyUnicode_UTF8(unicode) == NULL) {
3143 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003144 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3145 if (bytes == NULL)
3146 return NULL;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003147 _PyUnicode_UTF8(u) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3148 if (_PyUnicode_UTF8(u) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003149 Py_DECREF(bytes);
3150 return NULL;
3151 }
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003152 _PyUnicode_UTF8_LENGTH(u) = PyBytes_GET_SIZE(bytes);
3153 Py_MEMCPY(_PyUnicode_UTF8(u), PyBytes_AS_STRING(bytes), _PyUnicode_UTF8_LENGTH(u) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003154 Py_DECREF(bytes);
3155 }
3156
3157 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003158 *psize = PyUnicode_UTF8_LENGTH(unicode);
3159 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003160}
3161
3162char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003163PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003164{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003165 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3166}
3167
3168#ifdef Py_DEBUG
3169int unicode_as_unicode_calls = 0;
3170#endif
3171
3172
3173Py_UNICODE *
3174PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3175{
3176 PyUnicodeObject *u;
3177 const unsigned char *one_byte;
3178#if SIZEOF_WCHAR_T == 4
3179 const Py_UCS2 *two_bytes;
3180#else
3181 const Py_UCS4 *four_bytes;
3182 const Py_UCS4 *ucs4_end;
3183 Py_ssize_t num_surrogates;
3184#endif
3185 wchar_t *w;
3186 wchar_t *wchar_end;
3187
3188 if (!PyUnicode_Check(unicode)) {
3189 PyErr_BadArgument();
3190 return NULL;
3191 }
3192 u = (PyUnicodeObject*)unicode;
3193 if (_PyUnicode_WSTR(u) == NULL) {
3194 /* Non-ASCII compact unicode object */
3195 assert(_PyUnicode_KIND(u) != 0);
3196 assert(PyUnicode_IS_READY(u));
3197
3198#ifdef Py_DEBUG
3199 ++unicode_as_unicode_calls;
3200#endif
3201
3202 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
3203#if SIZEOF_WCHAR_T == 2
3204 four_bytes = PyUnicode_4BYTE_DATA(u);
3205 ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
3206 num_surrogates = 0;
3207
3208 for (; four_bytes < ucs4_end; ++four_bytes) {
3209 if (*four_bytes > 0xFFFF)
3210 ++num_surrogates;
3211 }
3212
3213 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
3214 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
3215 if (!_PyUnicode_WSTR(u)) {
3216 PyErr_NoMemory();
3217 return NULL;
3218 }
3219 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;
3220
3221 w = _PyUnicode_WSTR(u);
3222 wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
3223 four_bytes = PyUnicode_4BYTE_DATA(u);
3224 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3225 if (*four_bytes > 0xFFFF) {
3226 /* encode surrogate pair in this case */
3227 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
3228 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
3229 }
3230 else
3231 *w = *four_bytes;
3232
3233 if (w > wchar_end) {
3234 assert(0 && "Miscalculated string end");
3235 }
3236 }
3237 *w = 0;
3238#else
3239 /* sizeof(wchar_t) == 4 */
3240 Py_FatalError("Impossible unicode object state, wstr and str "
3241 "should share memory already.");
3242 return NULL;
3243#endif
3244 }
3245 else {
3246 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3247 (_PyUnicode_LENGTH(u) + 1));
3248 if (!_PyUnicode_WSTR(u)) {
3249 PyErr_NoMemory();
3250 return NULL;
3251 }
3252 if (!PyUnicode_IS_COMPACT_ASCII(u))
3253 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
3254 w = _PyUnicode_WSTR(u);
3255 wchar_end = w + _PyUnicode_LENGTH(u);
3256
3257 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
3258 one_byte = PyUnicode_1BYTE_DATA(u);
3259 for (; w < wchar_end; ++one_byte, ++w)
3260 *w = *one_byte;
3261 /* null-terminate the wstr */
3262 *w = 0;
3263 }
3264 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
3265#if SIZEOF_WCHAR_T == 4
3266 two_bytes = PyUnicode_2BYTE_DATA(u);
3267 for (; w < wchar_end; ++two_bytes, ++w)
3268 *w = *two_bytes;
3269 /* null-terminate the wstr */
3270 *w = 0;
3271#else
3272 /* sizeof(wchar_t) == 2 */
3273 PyObject_FREE(_PyUnicode_WSTR(u));
3274 _PyUnicode_WSTR(u) = NULL;
3275 Py_FatalError("Impossible unicode object state, wstr "
3276 "and str should share memory already.");
3277 return NULL;
3278#endif
3279 }
3280 else {
3281 assert(0 && "This should never happen.");
3282 }
3283 }
3284 }
3285 if (size != NULL)
3286 *size = PyUnicode_WSTR_LENGTH(u);
3287 return _PyUnicode_WSTR(u);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003288}
3289
Alexander Belopolsky40018472011-02-26 01:02:56 +00003290Py_UNICODE *
3291PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003292{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003293 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003294}
3295
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003296
Alexander Belopolsky40018472011-02-26 01:02:56 +00003297Py_ssize_t
3298PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003299{
3300 if (!PyUnicode_Check(unicode)) {
3301 PyErr_BadArgument();
3302 goto onError;
3303 }
3304 return PyUnicode_GET_SIZE(unicode);
3305
Benjamin Peterson29060642009-01-31 22:14:21 +00003306 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003307 return -1;
3308}
3309
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003310Py_ssize_t
3311PyUnicode_GetLength(PyObject *unicode)
3312{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003313 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003314 PyErr_BadArgument();
3315 return -1;
3316 }
3317
3318 return PyUnicode_GET_LENGTH(unicode);
3319}
3320
3321Py_UCS4
3322PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3323{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003324 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3325 PyErr_BadArgument();
3326 return (Py_UCS4)-1;
3327 }
3328 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3329 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003330 return (Py_UCS4)-1;
3331 }
3332 return PyUnicode_READ_CHAR(unicode, index);
3333}
3334
3335int
3336PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3337{
3338 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003339 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003340 return -1;
3341 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02003342 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3343 PyErr_SetString(PyExc_IndexError, "string index out of range");
3344 return -1;
3345 }
3346 if (_PyUnicode_Dirty(unicode))
3347 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003348 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3349 index, ch);
3350 return 0;
3351}
3352
/* Name of the default encoding; always the constant string "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3358
Victor Stinner554f3f02010-06-16 23:33:54 +00003359/* create or adjust a UnicodeDecodeError */
3360static void
3361make_decode_exception(PyObject **exceptionObject,
3362 const char *encoding,
3363 const char *input, Py_ssize_t length,
3364 Py_ssize_t startpos, Py_ssize_t endpos,
3365 const char *reason)
3366{
3367 if (*exceptionObject == NULL) {
3368 *exceptionObject = PyUnicodeDecodeError_Create(
3369 encoding, input, length, startpos, endpos, reason);
3370 }
3371 else {
3372 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3373 goto onError;
3374 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3375 goto onError;
3376 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3377 goto onError;
3378 }
3379 return;
3380
3381onError:
3382 Py_DECREF(*exceptionObject);
3383 *exceptionObject = NULL;
3384}
3385
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003386/* error handling callback helper:
3387 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003388 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003389 and adjust various state variables.
3390 return 0 on success, -1 on error
3391*/
3392
Alexander Belopolsky40018472011-02-26 01:02:56 +00003393static int
3394unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003395 const char *encoding, const char *reason,
3396 const char **input, const char **inend, Py_ssize_t *startinpos,
3397 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
3398 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003399{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003400 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003401
3402 PyObject *restuple = NULL;
3403 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003404 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003405 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003406 Py_ssize_t requiredsize;
3407 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003408 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003409 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003410 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003411 int res = -1;
3412
3413 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003414 *errorHandler = PyCodec_LookupError(errors);
3415 if (*errorHandler == NULL)
3416 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003417 }
3418
Victor Stinner554f3f02010-06-16 23:33:54 +00003419 make_decode_exception(exceptionObject,
3420 encoding,
3421 *input, *inend - *input,
3422 *startinpos, *endinpos,
3423 reason);
3424 if (*exceptionObject == NULL)
3425 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003426
3427 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3428 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003429 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003430 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003431 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003432 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003433 }
3434 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003435 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003436
3437 /* Copy back the bytes variables, which might have been modified by the
3438 callback */
3439 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3440 if (!inputobj)
3441 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003442 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003443 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003444 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003445 *input = PyBytes_AS_STRING(inputobj);
3446 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003447 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003448 /* we can DECREF safely, as the exception has another reference,
3449 so the object won't go away. */
3450 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003451
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003452 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003453 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003454 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003455 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3456 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003457 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003458
3459 /* need more space? (at least enough for what we
3460 have+the replacement+the rest of the string (starting
3461 at the new input position), so we won't have to check space
3462 when there are no errors in the rest of the string) */
3463 repptr = PyUnicode_AS_UNICODE(repunicode);
3464 repsize = PyUnicode_GET_SIZE(repunicode);
3465 requiredsize = *outpos + repsize + insize-newpos;
3466 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003467 if (requiredsize<2*outsize)
3468 requiredsize = 2*outsize;
Victor Stinnerfe226c02011-10-03 03:52:20 +02003469 if (PyUnicode_Resize((PyObject**)output, requiredsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003470 goto onError;
3471 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003472 }
3473 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003474 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003475 Py_UNICODE_COPY(*outptr, repptr, repsize);
3476 *outptr += repsize;
3477 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003478
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003479 /* we made it! */
3480 res = 0;
3481
Benjamin Peterson29060642009-01-31 22:14:21 +00003482 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003483 Py_XDECREF(restuple);
3484 return res;
3485}
3486
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

/* The table is only ever read, so it is declared const. */
static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * directO and directWS are parenthesized so that expression arguments
 * (e.g. `a || b`) are evaluated as a unit before being combined with
 * the category test. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003567
Alexander Belopolsky40018472011-02-26 01:02:56 +00003568PyObject *
3569PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003570 Py_ssize_t size,
3571 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003572{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003573 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3574}
3575
Antoine Pitrou244651a2009-05-04 18:56:13 +00003576/* The decoder. The only state we preserve is our read position,
3577 * i.e. how many characters we have consumed. So if we end in the
3578 * middle of a shift sequence we have to back off the read position
3579 * and the output to the beginning of the sequence, otherwise we lose
3580 * all the shift state (seen bits, number of bits seen, high
3581 * surrogate). */
3582
Alexander Belopolsky40018472011-02-26 01:02:56 +00003583PyObject *
3584PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003585 Py_ssize_t size,
3586 const char *errors,
3587 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003588{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003589 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003590 Py_ssize_t startinpos;
3591 Py_ssize_t endinpos;
3592 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003593 const char *e;
3594 PyUnicodeObject *unicode;
3595 Py_UNICODE *p;
3596 const char *errmsg = "";
3597 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003598 Py_UNICODE *shiftOutStart;
3599 unsigned int base64bits = 0;
3600 unsigned long base64buffer = 0;
3601 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003602 PyObject *errorHandler = NULL;
3603 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003604
3605 unicode = _PyUnicode_New(size);
3606 if (!unicode)
3607 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003608 if (size == 0) {
3609 if (consumed)
3610 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003611 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003612 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003613
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003614 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003615 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003616 e = s + size;
3617
3618 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003619 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003620 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003621 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003622
Antoine Pitrou244651a2009-05-04 18:56:13 +00003623 if (inShift) { /* in a base-64 section */
3624 if (IS_BASE64(ch)) { /* consume a base-64 character */
3625 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3626 base64bits += 6;
3627 s++;
3628 if (base64bits >= 16) {
3629 /* we have enough bits for a UTF-16 value */
3630 Py_UNICODE outCh = (Py_UNICODE)
3631 (base64buffer >> (base64bits-16));
3632 base64bits -= 16;
3633 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3634 if (surrogate) {
3635 /* expecting a second surrogate */
3636 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3637#ifdef Py_UNICODE_WIDE
3638 *p++ = (((surrogate & 0x3FF)<<10)
3639 | (outCh & 0x3FF)) + 0x10000;
3640#else
3641 *p++ = surrogate;
3642 *p++ = outCh;
3643#endif
3644 surrogate = 0;
3645 }
3646 else {
3647 surrogate = 0;
3648 errmsg = "second surrogate missing";
3649 goto utf7Error;
3650 }
3651 }
3652 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3653 /* first surrogate */
3654 surrogate = outCh;
3655 }
3656 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3657 errmsg = "unexpected second surrogate";
3658 goto utf7Error;
3659 }
3660 else {
3661 *p++ = outCh;
3662 }
3663 }
3664 }
3665 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003666 inShift = 0;
3667 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003668 if (surrogate) {
3669 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003670 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003671 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003672 if (base64bits > 0) { /* left-over bits */
3673 if (base64bits >= 6) {
3674 /* We've seen at least one base-64 character */
3675 errmsg = "partial character in shift sequence";
3676 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003677 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003678 else {
3679 /* Some bits remain; they should be zero */
3680 if (base64buffer != 0) {
3681 errmsg = "non-zero padding bits in shift sequence";
3682 goto utf7Error;
3683 }
3684 }
3685 }
3686 if (ch != '-') {
3687 /* '-' is absorbed; other terminating
3688 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003689 *p++ = ch;
3690 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003691 }
3692 }
3693 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003694 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003695 s++; /* consume '+' */
3696 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003697 s++;
3698 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003699 }
3700 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003701 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003702 shiftOutStart = p;
3703 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003704 }
3705 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003706 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003707 *p++ = ch;
3708 s++;
3709 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003710 else {
3711 startinpos = s-starts;
3712 s++;
3713 errmsg = "unexpected special character";
3714 goto utf7Error;
3715 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003716 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003717utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003718 outpos = p-PyUnicode_AS_UNICODE(unicode);
3719 endinpos = s-starts;
3720 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003721 errors, &errorHandler,
3722 "utf7", errmsg,
3723 &starts, &e, &startinpos, &endinpos, &exc, &s,
3724 &unicode, &outpos, &p))
3725 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003726 }
3727
Antoine Pitrou244651a2009-05-04 18:56:13 +00003728 /* end of string */
3729
3730 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3731 /* if we're in an inconsistent state, that's an error */
3732 if (surrogate ||
3733 (base64bits >= 6) ||
3734 (base64bits > 0 && base64buffer != 0)) {
3735 outpos = p-PyUnicode_AS_UNICODE(unicode);
3736 endinpos = size;
3737 if (unicode_decode_call_errorhandler(
3738 errors, &errorHandler,
3739 "utf7", "unterminated shift sequence",
3740 &starts, &e, &startinpos, &endinpos, &exc, &s,
3741 &unicode, &outpos, &p))
3742 goto onError;
3743 if (s < e)
3744 goto restart;
3745 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003746 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003747
3748 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003749 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003750 if (inShift) {
3751 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003752 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003753 }
3754 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003755 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003756 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003757 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003758
Victor Stinnerfe226c02011-10-03 03:52:20 +02003759 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003760 goto onError;
3761
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003762 Py_XDECREF(errorHandler);
3763 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02003764#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02003765 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003766 Py_DECREF(unicode);
3767 return NULL;
3768 }
Victor Stinner17efeed2011-10-04 20:05:46 +02003769#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02003770 assert(_PyUnicode_CheckConsistency(unicode, 1));
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003771 return (PyObject *)unicode;
3772
Benjamin Peterson29060642009-01-31 22:14:21 +00003773 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003774 Py_XDECREF(errorHandler);
3775 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003776 Py_DECREF(unicode);
3777 return NULL;
3778}
3779
3780
Alexander Belopolsky40018472011-02-26 01:02:56 +00003781PyObject *
3782PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003783 Py_ssize_t size,
3784 int base64SetO,
3785 int base64WhiteSpace,
3786 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003787{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003788 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003789 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003790 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003791 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003792 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003793 unsigned int base64bits = 0;
3794 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003795 char * out;
3796 char * start;
3797
3798 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003799 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003800
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003801 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003802 return PyErr_NoMemory();
3803
Antoine Pitrou244651a2009-05-04 18:56:13 +00003804 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003805 if (v == NULL)
3806 return NULL;
3807
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003808 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003809 for (;i < size; ++i) {
3810 Py_UNICODE ch = s[i];
3811
Antoine Pitrou244651a2009-05-04 18:56:13 +00003812 if (inShift) {
3813 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3814 /* shifting out */
3815 if (base64bits) { /* output remaining bits */
3816 *out++ = TO_BASE64(base64buffer << (6-base64bits));
3817 base64buffer = 0;
3818 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003819 }
3820 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003821 /* Characters not in the BASE64 set implicitly unshift the sequence
3822 so no '-' is required, except if the character is itself a '-' */
3823 if (IS_BASE64(ch) || ch == '-') {
3824 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003825 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003826 *out++ = (char) ch;
3827 }
3828 else {
3829 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00003830 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003831 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003832 else { /* not in a shift sequence */
3833 if (ch == '+') {
3834 *out++ = '+';
3835 *out++ = '-';
3836 }
3837 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3838 *out++ = (char) ch;
3839 }
3840 else {
3841 *out++ = '+';
3842 inShift = 1;
3843 goto encode_char;
3844 }
3845 }
3846 continue;
3847encode_char:
3848#ifdef Py_UNICODE_WIDE
3849 if (ch >= 0x10000) {
3850 /* code first surrogate */
3851 base64bits += 16;
3852 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
3853 while (base64bits >= 6) {
3854 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3855 base64bits -= 6;
3856 }
3857 /* prepare second surrogate */
3858 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
3859 }
3860#endif
3861 base64bits += 16;
3862 base64buffer = (base64buffer << 16) | ch;
3863 while (base64bits >= 6) {
3864 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3865 base64bits -= 6;
3866 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00003867 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003868 if (base64bits)
3869 *out++= TO_BASE64(base64buffer << (6-base64bits) );
3870 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003871 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003872 if (_PyBytes_Resize(&v, out - start) < 0)
3873 return NULL;
3874 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003875}
3876
Antoine Pitrou244651a2009-05-04 18:56:13 +00003877#undef IS_BASE64
3878#undef FROM_BASE64
3879#undef TO_BASE64
3880#undef DECODE_DIRECT
3881#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003882
Guido van Rossumd57fd912000-03-10 22:53:23 +00003883/* --- UTF-8 Codec -------------------------------------------------------- */
3884
/* Total length, in bytes, of the UTF-8 sequence introduced by each possible
   lead byte.  A zero entry marks a byte that may not start a sequence.
   See RFC 3629 for details. */
static char utf8_code_length[256] = {
    /* 00-7F: single-byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 70-7F */

    /* 80-BF: illegal as a prefix byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* B0-BF */

    /* C0-C1 are illegal; C2-DF start two-byte sequences */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  /* C0-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  /* D0-DF */

    /* E0-EF start three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  /* E0-EF */

    /* F0-F4 start four-byte sequences; F5-FF are illegal */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0   /* F0-FF */
};
3906
Alexander Belopolsky40018472011-02-26 01:02:56 +00003907PyObject *
3908PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003909 Py_ssize_t size,
3910 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003911{
Walter Dörwald69652032004-09-07 20:24:22 +00003912 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3913}
3914
Antoine Pitrouab868312009-01-10 15:40:25 +00003915/* Mask to check or force alignment of a pointer to C 'long' boundaries */
3916#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
3917
3918/* Mask to quickly check whether a C 'long' contains a
3919 non-ASCII, UTF8-encoded char. */
3920#if (SIZEOF_LONG == 8)
3921# define ASCII_CHAR_MASK 0x8080808080808080L
3922#elif (SIZEOF_LONG == 4)
3923# define ASCII_CHAR_MASK 0x80808080L
3924#else
3925# error C 'long' size should be either 4 or 8!
3926#endif
3927
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003928/* Scans a UTF-8 string and returns the maximum character to be expected,
3929 the size of the decoded unicode string and if any major errors were
3930 encountered.
3931
3932 This function does check basic UTF-8 sanity, it does however NOT CHECK
3933 if the string contains surrogates, and if all continuation bytes are
3934 within the correct ranges, these checks are performed in
3935 PyUnicode_DecodeUTF8Stateful.
3936
3937 If it sets has_errors to 1, it means the value of unicode_size and max_char
3938 will be bogus and you should not rely on useful information in them.
3939 */
3940static Py_UCS4
3941utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
3942 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
3943 int *has_errors)
3944{
3945 Py_ssize_t n;
3946 Py_ssize_t char_count = 0;
3947 Py_UCS4 max_char = 127, new_max;
3948 Py_UCS4 upper_bound;
3949 const unsigned char *p = (const unsigned char *)s;
3950 const unsigned char *end = p + string_size;
3951 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
3952 int err = 0;
3953
3954 for (; p < end && !err; ++p, ++char_count) {
3955 /* Only check value if it's not a ASCII char... */
3956 if (*p < 0x80) {
3957 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
3958 an explanation. */
3959 if (!((size_t) p & LONG_PTR_MASK)) {
3960 /* Help register allocation */
3961 register const unsigned char *_p = p;
3962 while (_p < aligned_end) {
3963 unsigned long value = *(unsigned long *) _p;
3964 if (value & ASCII_CHAR_MASK)
3965 break;
3966 _p += SIZEOF_LONG;
3967 char_count += SIZEOF_LONG;
3968 }
3969 p = _p;
3970 if (p == end)
3971 break;
3972 }
3973 }
3974 if (*p >= 0x80) {
3975 n = utf8_code_length[*p];
3976 new_max = max_char;
3977 switch (n) {
3978 /* invalid start byte */
3979 case 0:
3980 err = 1;
3981 break;
3982 case 2:
3983 /* Code points between 0x00FF and 0x07FF inclusive.
3984 Approximate the upper bound of the code point,
3985 if this flips over 255 we can be sure it will be more
3986 than 255 and the string will need 2 bytes per code coint,
3987 if it stays under or equal to 255, we can be sure 1 byte
3988 is enough.
3989 ((*p & 0b00011111) << 6) | 0b00111111 */
3990 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
3991 if (max_char < upper_bound)
3992 new_max = upper_bound;
3993 /* Ensure we track at least that we left ASCII space. */
3994 if (new_max < 128)
3995 new_max = 128;
3996 break;
3997 case 3:
3998 /* Between 0x0FFF and 0xFFFF inclusive, so values are
3999 always > 255 and <= 65535 and will always need 2 bytes. */
4000 if (max_char < 65535)
4001 new_max = 65535;
4002 break;
4003 case 4:
4004 /* Code point will be above 0xFFFF for sure in this case. */
4005 new_max = 65537;
4006 break;
4007 /* Internal error, this should be caught by the first if */
4008 case 1:
4009 default:
4010 assert(0 && "Impossible case in utf8_max_char_and_size");
4011 err = 1;
4012 }
4013 /* Instead of number of overall bytes for this code point,
Georg Brandl7597add2011-10-05 16:36:47 +02004014 n contains the number of following bytes: */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004015 --n;
4016 /* Check if the follow up chars are all valid continuation bytes */
4017 if (n >= 1) {
4018 const unsigned char *cont;
4019 if ((p + n) >= end) {
4020 if (consumed == 0)
4021 /* incomplete data, non-incremental decoding */
4022 err = 1;
4023 break;
4024 }
4025 for (cont = p + 1; cont < (p + n); ++cont) {
4026 if ((*cont & 0xc0) != 0x80) {
4027 err = 1;
4028 break;
4029 }
4030 }
4031 p += n;
4032 }
4033 else
4034 err = 1;
4035 max_char = new_max;
4036 }
4037 }
4038
4039 if (unicode_size)
4040 *unicode_size = char_count;
4041 if (has_errors)
4042 *has_errors = err;
4043 return max_char;
4044}
4045
/* Store `value` at position `index` of the character buffer `buf`, using
   the element type selected by `kind`.  Similar to PyUnicode_WRITE, but
   `kind` may also be PyUnicode_WCHAR_KIND, in which case `buf` is written
   as the Py_UNICODE (wstr) array of the legacy unicode representation.
   Any kind other than WCHAR, 1BYTE or 2BYTE is written as Py_UCS4. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value)                     \
    do {                                                                    \
        const int k_ = (kind);                                              \
        switch (k_) {                                                       \
        case PyUnicode_WCHAR_KIND:                                          \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);           \
            break;                                                          \
        case PyUnicode_1BYTE_KIND:                                          \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value);     \
            break;                                                          \
        case PyUnicode_2BYTE_KIND:                                          \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value);                 \
            break;                                                          \
        default:                                                            \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value);                 \
            break;                                                          \
        }                                                                   \
    } while (0)
4060
Alexander Belopolsky40018472011-02-26 01:02:56 +00004061PyObject *
4062PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004063 Py_ssize_t size,
4064 const char *errors,
4065 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004066{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004067 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004068 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004069 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004070 Py_ssize_t startinpos;
4071 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00004072 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004073 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004074 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004075 PyObject *errorHandler = NULL;
4076 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004077 Py_UCS4 maxchar = 0;
4078 Py_ssize_t unicode_size;
4079 Py_ssize_t i;
4080 int kind;
4081 void *data;
4082 int has_errors;
4083 Py_UNICODE *error_outptr;
4084#if SIZEOF_WCHAR_T == 2
4085 Py_ssize_t wchar_offset = 0;
4086#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004087
Walter Dörwald69652032004-09-07 20:24:22 +00004088 if (size == 0) {
4089 if (consumed)
4090 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004091 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00004092 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004093 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
4094 consumed, &has_errors);
4095 if (has_errors) {
4096 unicode = _PyUnicode_New(size);
4097 if (!unicode)
4098 return NULL;
4099 kind = PyUnicode_WCHAR_KIND;
4100 data = PyUnicode_AS_UNICODE(unicode);
4101 assert(data != NULL);
4102 }
4103 else {
4104 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
4105 if (!unicode)
4106 return NULL;
4107 /* When the string is ASCII only, just use memcpy and return.
4108 unicode_size may be != size if there is an incomplete UTF-8
4109 sequence at the end of the ASCII block. */
4110 if (maxchar < 128 && size == unicode_size) {
4111 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
4112 return (PyObject *)unicode;
4113 }
4114 kind = PyUnicode_KIND(unicode);
4115 data = PyUnicode_DATA(unicode);
4116 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004117 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004118 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004119 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00004120 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004121
4122 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004123 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004124
4125 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004126 /* Fast path for runs of ASCII characters. Given that common UTF-8
4127 input will consist of an overwhelming majority of ASCII
4128 characters, we try to optimize for this case by checking
4129 as many characters as a C 'long' can contain.
4130 First, check if we can do an aligned read, as most CPUs have
4131 a penalty for unaligned reads.
4132 */
4133 if (!((size_t) s & LONG_PTR_MASK)) {
4134 /* Help register allocation */
4135 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004136 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004137 while (_s < aligned_end) {
4138 /* Read a whole long at a time (either 4 or 8 bytes),
4139 and do a fast unrolled copy if it only contains ASCII
4140 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004141 unsigned long value = *(unsigned long *) _s;
4142 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004143 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004144 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
4145 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
4146 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
4147 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004148#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004149 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
4150 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
4151 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
4152 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004153#endif
4154 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004155 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004156 }
4157 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004158 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004159 if (s == e)
4160 break;
4161 ch = (unsigned char)*s;
4162 }
4163 }
4164
4165 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004166 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004167 s++;
4168 continue;
4169 }
4170
4171 n = utf8_code_length[ch];
4172
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004173 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004174 if (consumed)
4175 break;
4176 else {
4177 errmsg = "unexpected end of data";
4178 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004179 endinpos = startinpos+1;
4180 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4181 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004182 goto utf8Error;
4183 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004184 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004185
4186 switch (n) {
4187
4188 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004189 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004190 startinpos = s-starts;
4191 endinpos = startinpos+1;
4192 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004193
4194 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004195 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004196 startinpos = s-starts;
4197 endinpos = startinpos+1;
4198 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004199
4200 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004201 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004202 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004203 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004204 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004205 goto utf8Error;
4206 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004207 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004208 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004209 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004210 break;
4211
4212 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004213 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4214 will result in surrogates in range d800-dfff. Surrogates are
4215 not valid UTF-8 so they are rejected.
4216 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4217 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004218 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004219 (s[2] & 0xc0) != 0x80 ||
4220 ((unsigned char)s[0] == 0xE0 &&
4221 (unsigned char)s[1] < 0xA0) ||
4222 ((unsigned char)s[0] == 0xED &&
4223 (unsigned char)s[1] > 0x9F)) {
4224 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004225 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004226 endinpos = startinpos + 1;
4227
4228 /* if s[1] first two bits are 1 and 0, then the invalid
4229 continuation byte is s[2], so increment endinpos by 1,
4230 if not, s[1] is invalid and endinpos doesn't need to
4231 be incremented. */
4232 if ((s[1] & 0xC0) == 0x80)
4233 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004234 goto utf8Error;
4235 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004236 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004237 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004238 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004239 break;
4240
4241 case 4:
4242 if ((s[1] & 0xc0) != 0x80 ||
4243 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004244 (s[3] & 0xc0) != 0x80 ||
4245 ((unsigned char)s[0] == 0xF0 &&
4246 (unsigned char)s[1] < 0x90) ||
4247 ((unsigned char)s[0] == 0xF4 &&
4248 (unsigned char)s[1] > 0x8F)) {
4249 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004250 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004251 endinpos = startinpos + 1;
4252 if ((s[1] & 0xC0) == 0x80) {
4253 endinpos++;
4254 if ((s[2] & 0xC0) == 0x80)
4255 endinpos++;
4256 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004257 goto utf8Error;
4258 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004259 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004260 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4261 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4262
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004263 /* If the string is flexible or we have native UCS-4, write
4264 directly.. */
4265 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
4266 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00004267
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004268 else {
4269 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00004270
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004271 /* translate from 10000..10FFFF to 0..FFFF */
4272 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00004273
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004274 /* high surrogate = top 10 bits added to D800 */
4275 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4276 (Py_UNICODE)(0xD800 + (ch >> 10)));
4277
4278 /* low surrogate = bottom 10 bits added to DC00 */
4279 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4280 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
4281 }
4282#if SIZEOF_WCHAR_T == 2
4283 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004284#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004285 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004286 }
4287 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004288 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004289
Benjamin Peterson29060642009-01-31 22:14:21 +00004290 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004291 /* If this is not yet a resizable string, make it one.. */
4292 if (kind != PyUnicode_WCHAR_KIND) {
4293 const Py_UNICODE *u;
4294 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
4295 if (!new_unicode)
4296 goto onError;
4297 u = PyUnicode_AsUnicode((PyObject *)unicode);
4298 if (!u)
4299 goto onError;
4300#if SIZEOF_WCHAR_T == 2
4301 i += wchar_offset;
4302#endif
4303 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
4304 Py_DECREF(unicode);
4305 unicode = new_unicode;
4306 kind = 0;
4307 data = PyUnicode_AS_UNICODE(new_unicode);
4308 assert(data != NULL);
4309 }
4310 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00004311 if (unicode_decode_call_errorhandler(
4312 errors, &errorHandler,
4313 "utf8", errmsg,
4314 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004315 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00004316 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004317 /* Update data because unicode_decode_call_errorhandler might have
4318 re-created or resized the unicode object. */
4319 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00004320 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004321 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004322 /* Ensure the unicode_size calculation above was correct: */
4323 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
4324
Walter Dörwald69652032004-09-07 20:24:22 +00004325 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004326 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004327
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004328 /* Adjust length and ready string when it contained errors and
4329 is of the old resizable kind. */
4330 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004331 if (PyUnicode_Resize((PyObject**)&unicode, i) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004332 goto onError;
4333 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004334
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004335 Py_XDECREF(errorHandler);
4336 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02004337#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004338 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004339 Py_DECREF(unicode);
4340 return NULL;
4341 }
Victor Stinner17efeed2011-10-04 20:05:46 +02004342#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004343 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004344 return (PyObject *)unicode;
4345
Benjamin Peterson29060642009-01-31 22:14:21 +00004346 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004347 Py_XDECREF(errorHandler);
4348 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004349 Py_DECREF(unicode);
4350 return NULL;
4351}
4352
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004353#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00004354
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004355#ifdef __APPLE__
4356
4357/* Simplified UTF-8 decoder using surrogateescape error handler,
4358 used to decode the command line arguments on Mac OS X. */
4359
4360wchar_t*
4361_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4362{
4363 int n;
4364 const char *e;
4365 wchar_t *unicode, *p;
4366
4367 /* Note: size will always be longer than the resulting Unicode
4368 character count */
4369 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
4370 PyErr_NoMemory();
4371 return NULL;
4372 }
4373 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4374 if (!unicode)
4375 return NULL;
4376
4377 /* Unpack UTF-8 encoded data */
4378 p = unicode;
4379 e = s + size;
4380 while (s < e) {
4381 Py_UCS4 ch = (unsigned char)*s;
4382
4383 if (ch < 0x80) {
4384 *p++ = (wchar_t)ch;
4385 s++;
4386 continue;
4387 }
4388
4389 n = utf8_code_length[ch];
4390 if (s + n > e) {
4391 goto surrogateescape;
4392 }
4393
4394 switch (n) {
4395 case 0:
4396 case 1:
4397 goto surrogateescape;
4398
4399 case 2:
4400 if ((s[1] & 0xc0) != 0x80)
4401 goto surrogateescape;
4402 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
4403 assert ((ch > 0x007F) && (ch <= 0x07FF));
4404 *p++ = (wchar_t)ch;
4405 break;
4406
4407 case 3:
4408 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4409 will result in surrogates in range d800-dfff. Surrogates are
4410 not valid UTF-8 so they are rejected.
4411 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4412 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
4413 if ((s[1] & 0xc0) != 0x80 ||
4414 (s[2] & 0xc0) != 0x80 ||
4415 ((unsigned char)s[0] == 0xE0 &&
4416 (unsigned char)s[1] < 0xA0) ||
4417 ((unsigned char)s[0] == 0xED &&
4418 (unsigned char)s[1] > 0x9F)) {
4419
4420 goto surrogateescape;
4421 }
4422 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4423 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004424 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004425 break;
4426
4427 case 4:
4428 if ((s[1] & 0xc0) != 0x80 ||
4429 (s[2] & 0xc0) != 0x80 ||
4430 (s[3] & 0xc0) != 0x80 ||
4431 ((unsigned char)s[0] == 0xF0 &&
4432 (unsigned char)s[1] < 0x90) ||
4433 ((unsigned char)s[0] == 0xF4 &&
4434 (unsigned char)s[1] > 0x8F)) {
4435 goto surrogateescape;
4436 }
4437 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
4438 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4439 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4440
4441#if SIZEOF_WCHAR_T == 4
4442 *p++ = (wchar_t)ch;
4443#else
4444 /* compute and append the two surrogates: */
4445
4446 /* translate from 10000..10FFFF to 0..FFFF */
4447 ch -= 0x10000;
4448
4449 /* high surrogate = top 10 bits added to D800 */
4450 *p++ = (wchar_t)(0xD800 + (ch >> 10));
4451
4452 /* low surrogate = bottom 10 bits added to DC00 */
4453 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
4454#endif
4455 break;
4456 }
4457 s += n;
4458 continue;
4459
4460 surrogateescape:
4461 *p++ = 0xDC00 + ch;
4462 s++;
4463 }
4464 *p = L'\0';
4465 return unicode;
4466}
4467
4468#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004469
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004470/* Primary internal function which creates utf8 encoded bytes objects.
4471
4472 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004473 and allocate exactly as much space needed at the end. Else allocate the
4474 maximum possible needed (4 result bytes per Unicode character), and return
4475 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004476*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004477PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004478_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004479{
Tim Peters602f7402002-04-27 18:03:26 +00004480#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004481
Guido van Rossum98297ee2007-11-06 21:34:58 +00004482 Py_ssize_t i; /* index into s of next input byte */
4483 PyObject *result; /* result string object */
4484 char *p; /* next free byte in output buffer */
4485 Py_ssize_t nallocated; /* number of result bytes allocated */
4486 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004487 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004488 PyObject *errorHandler = NULL;
4489 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004490 int kind;
4491 void *data;
4492 Py_ssize_t size;
4493 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
4494#if SIZEOF_WCHAR_T == 2
4495 Py_ssize_t wchar_offset = 0;
4496#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004497
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004498 if (!PyUnicode_Check(unicode)) {
4499 PyErr_BadArgument();
4500 return NULL;
4501 }
4502
4503 if (PyUnicode_READY(unicode) == -1)
4504 return NULL;
4505
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004506 if (PyUnicode_UTF8(unicode))
4507 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4508 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004509
4510 kind = PyUnicode_KIND(unicode);
4511 data = PyUnicode_DATA(unicode);
4512 size = PyUnicode_GET_LENGTH(unicode);
4513
Tim Peters602f7402002-04-27 18:03:26 +00004514 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004515
Tim Peters602f7402002-04-27 18:03:26 +00004516 if (size <= MAX_SHORT_UNICHARS) {
4517 /* Write into the stack buffer; nallocated can't overflow.
4518 * At the end, we'll allocate exactly as much heap space as it
4519 * turns out we need.
4520 */
4521 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004522 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004523 p = stackbuf;
4524 }
4525 else {
4526 /* Overallocate on the heap, and give the excess back at the end. */
4527 nallocated = size * 4;
4528 if (nallocated / 4 != size) /* overflow! */
4529 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004530 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004531 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004532 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004533 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004534 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004535
Tim Peters602f7402002-04-27 18:03:26 +00004536 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004537 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004538
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004539 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004540 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004541 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004542
Guido van Rossumd57fd912000-03-10 22:53:23 +00004543 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004544 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004545 *p++ = (char)(0xc0 | (ch >> 6));
4546 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004547 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004548 Py_ssize_t newpos;
4549 PyObject *rep;
4550 Py_ssize_t repsize, k, startpos;
4551 startpos = i-1;
4552#if SIZEOF_WCHAR_T == 2
4553 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00004554#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004555 rep = unicode_encode_call_errorhandler(
4556 errors, &errorHandler, "utf-8", "surrogates not allowed",
4557 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4558 &exc, startpos, startpos+1, &newpos);
4559 if (!rep)
4560 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004561
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004562 if (PyBytes_Check(rep))
4563 repsize = PyBytes_GET_SIZE(rep);
4564 else
4565 repsize = PyUnicode_GET_SIZE(rep);
4566
4567 if (repsize > 4) {
4568 Py_ssize_t offset;
4569
4570 if (result == NULL)
4571 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004572 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004573 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004574
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004575 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4576 /* integer overflow */
4577 PyErr_NoMemory();
4578 goto error;
4579 }
4580 nallocated += repsize - 4;
4581 if (result != NULL) {
4582 if (_PyBytes_Resize(&result, nallocated) < 0)
4583 goto error;
4584 } else {
4585 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004586 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004587 goto error;
4588 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4589 }
4590 p = PyBytes_AS_STRING(result) + offset;
4591 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004592
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004593 if (PyBytes_Check(rep)) {
4594 char *prep = PyBytes_AS_STRING(rep);
4595 for(k = repsize; k > 0; k--)
4596 *p++ = *prep++;
4597 } else /* rep is unicode */ {
4598 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4599 Py_UNICODE c;
4600
4601 for(k=0; k<repsize; k++) {
4602 c = prep[k];
4603 if (0x80 <= c) {
4604 raise_encode_exception(&exc, "utf-8",
4605 PyUnicode_AS_UNICODE(unicode),
4606 size, i-1, i,
4607 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004608 goto error;
4609 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004610 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004611 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004612 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004613 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004614 } else if (ch < 0x10000) {
4615 *p++ = (char)(0xe0 | (ch >> 12));
4616 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4617 *p++ = (char)(0x80 | (ch & 0x3f));
4618 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004619 /* Encode UCS4 Unicode ordinals */
4620 *p++ = (char)(0xf0 | (ch >> 18));
4621 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4622 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4623 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004624#if SIZEOF_WCHAR_T == 2
4625 wchar_offset++;
4626#endif
Tim Peters602f7402002-04-27 18:03:26 +00004627 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004628 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004629
Guido van Rossum98297ee2007-11-06 21:34:58 +00004630 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004631 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004632 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004633 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004634 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004635 }
4636 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004637 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004638 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004639 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004640 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004641 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004642
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004643 Py_XDECREF(errorHandler);
4644 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004645 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004646 error:
4647 Py_XDECREF(errorHandler);
4648 Py_XDECREF(exc);
4649 Py_XDECREF(result);
4650 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004651
Tim Peters602f7402002-04-27 18:03:26 +00004652#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004653}
4654
Alexander Belopolsky40018472011-02-26 01:02:56 +00004655PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004656PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4657 Py_ssize_t size,
4658 const char *errors)
4659{
4660 PyObject *v, *unicode;
4661
4662 unicode = PyUnicode_FromUnicode(s, size);
4663 if (unicode == NULL)
4664 return NULL;
4665 v = _PyUnicode_AsUTF8String(unicode, errors);
4666 Py_DECREF(unicode);
4667 return v;
4668}
4669
4670PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004671PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004672{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004673 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004674}
4675
Walter Dörwald41980ca2007-08-16 21:55:45 +00004676/* --- UTF-32 Codec ------------------------------------------------------- */
4677
4678PyObject *
4679PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004680 Py_ssize_t size,
4681 const char *errors,
4682 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004683{
4684 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4685}
4686
4687PyObject *
4688PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004689 Py_ssize_t size,
4690 const char *errors,
4691 int *byteorder,
4692 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004693{
4694 const char *starts = s;
4695 Py_ssize_t startinpos;
4696 Py_ssize_t endinpos;
4697 Py_ssize_t outpos;
4698 PyUnicodeObject *unicode;
4699 Py_UNICODE *p;
4700#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004701 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004702 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004703#else
4704 const int pairs = 0;
4705#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004706 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004707 int bo = 0; /* assume native ordering by default */
4708 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004709 /* Offsets from q for retrieving bytes in the right order. */
4710#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4711 int iorder[] = {0, 1, 2, 3};
4712#else
4713 int iorder[] = {3, 2, 1, 0};
4714#endif
4715 PyObject *errorHandler = NULL;
4716 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004717
Walter Dörwald41980ca2007-08-16 21:55:45 +00004718 q = (unsigned char *)s;
4719 e = q + size;
4720
4721 if (byteorder)
4722 bo = *byteorder;
4723
4724 /* Check for BOM marks (U+FEFF) in the input and adjust current
4725 byte order setting accordingly. In native mode, the leading BOM
4726 mark is skipped, in all other modes, it is copied to the output
4727 stream as-is (giving a ZWNBSP character). */
4728 if (bo == 0) {
4729 if (size >= 4) {
4730 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004731 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004732#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004733 if (bom == 0x0000FEFF) {
4734 q += 4;
4735 bo = -1;
4736 }
4737 else if (bom == 0xFFFE0000) {
4738 q += 4;
4739 bo = 1;
4740 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004741#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004742 if (bom == 0x0000FEFF) {
4743 q += 4;
4744 bo = 1;
4745 }
4746 else if (bom == 0xFFFE0000) {
4747 q += 4;
4748 bo = -1;
4749 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004750#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004751 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004752 }
4753
4754 if (bo == -1) {
4755 /* force LE */
4756 iorder[0] = 0;
4757 iorder[1] = 1;
4758 iorder[2] = 2;
4759 iorder[3] = 3;
4760 }
4761 else if (bo == 1) {
4762 /* force BE */
4763 iorder[0] = 3;
4764 iorder[1] = 2;
4765 iorder[2] = 1;
4766 iorder[3] = 0;
4767 }
4768
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004769 /* On narrow builds we split characters outside the BMP into two
4770 codepoints => count how much extra space we need. */
4771#ifndef Py_UNICODE_WIDE
4772 for (qq = q; qq < e; qq += 4)
4773 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4774 pairs++;
4775#endif
4776
4777 /* This might be one to much, because of a BOM */
4778 unicode = _PyUnicode_New((size+3)/4+pairs);
4779 if (!unicode)
4780 return NULL;
4781 if (size == 0)
4782 return (PyObject *)unicode;
4783
4784 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004785 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004786
Walter Dörwald41980ca2007-08-16 21:55:45 +00004787 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004788 Py_UCS4 ch;
4789 /* remaining bytes at the end? (size should be divisible by 4) */
4790 if (e-q<4) {
4791 if (consumed)
4792 break;
4793 errmsg = "truncated data";
4794 startinpos = ((const char *)q)-starts;
4795 endinpos = ((const char *)e)-starts;
4796 goto utf32Error;
4797 /* The remaining input chars are ignored if the callback
4798 chooses to skip the input */
4799 }
4800 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4801 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004802
Benjamin Peterson29060642009-01-31 22:14:21 +00004803 if (ch >= 0x110000)
4804 {
4805 errmsg = "codepoint not in range(0x110000)";
4806 startinpos = ((const char *)q)-starts;
4807 endinpos = startinpos+4;
4808 goto utf32Error;
4809 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004810#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004811 if (ch >= 0x10000)
4812 {
4813 *p++ = 0xD800 | ((ch-0x10000) >> 10);
4814 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4815 }
4816 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00004817#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004818 *p++ = ch;
4819 q += 4;
4820 continue;
4821 utf32Error:
4822 outpos = p-PyUnicode_AS_UNICODE(unicode);
4823 if (unicode_decode_call_errorhandler(
4824 errors, &errorHandler,
4825 "utf32", errmsg,
4826 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4827 &unicode, &outpos, &p))
4828 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004829 }
4830
4831 if (byteorder)
4832 *byteorder = bo;
4833
4834 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004835 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004836
4837 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02004838 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004839 goto onError;
4840
4841 Py_XDECREF(errorHandler);
4842 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02004843#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004844 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004845 Py_DECREF(unicode);
4846 return NULL;
4847 }
Victor Stinner17efeed2011-10-04 20:05:46 +02004848#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004849 assert(_PyUnicode_CheckConsistency(unicode, 1));
Walter Dörwald41980ca2007-08-16 21:55:45 +00004850 return (PyObject *)unicode;
4851
Benjamin Peterson29060642009-01-31 22:14:21 +00004852 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00004853 Py_DECREF(unicode);
4854 Py_XDECREF(errorHandler);
4855 Py_XDECREF(exc);
4856 return NULL;
4857}
4858
4859PyObject *
4860PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004861 Py_ssize_t size,
4862 const char *errors,
4863 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004864{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004865 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004866 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004867 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004868#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004869 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004870#else
4871 const int pairs = 0;
4872#endif
4873 /* Offsets from p for storing byte pairs in the right order. */
4874#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4875 int iorder[] = {0, 1, 2, 3};
4876#else
4877 int iorder[] = {3, 2, 1, 0};
4878#endif
4879
Benjamin Peterson29060642009-01-31 22:14:21 +00004880#define STORECHAR(CH) \
4881 do { \
4882 p[iorder[3]] = ((CH) >> 24) & 0xff; \
4883 p[iorder[2]] = ((CH) >> 16) & 0xff; \
4884 p[iorder[1]] = ((CH) >> 8) & 0xff; \
4885 p[iorder[0]] = (CH) & 0xff; \
4886 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00004887 } while(0)
4888
4889 /* In narrow builds we can output surrogate pairs as one codepoint,
4890 so we need less space. */
4891#ifndef Py_UNICODE_WIDE
4892 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004893 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
4894 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
4895 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004896#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004897 nsize = (size - pairs + (byteorder == 0));
4898 bytesize = nsize * 4;
4899 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004900 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004901 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004902 if (v == NULL)
4903 return NULL;
4904
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004905 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004906 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004907 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004908 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004909 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004910
4911 if (byteorder == -1) {
4912 /* force LE */
4913 iorder[0] = 0;
4914 iorder[1] = 1;
4915 iorder[2] = 2;
4916 iorder[3] = 3;
4917 }
4918 else if (byteorder == 1) {
4919 /* force BE */
4920 iorder[0] = 3;
4921 iorder[1] = 2;
4922 iorder[2] = 1;
4923 iorder[3] = 0;
4924 }
4925
4926 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004927 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004928#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004929 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
4930 Py_UCS4 ch2 = *s;
4931 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
4932 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
4933 s++;
4934 size--;
4935 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004936 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004937#endif
4938 STORECHAR(ch);
4939 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004940
4941 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004942 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004943#undef STORECHAR
4944}
4945
Alexander Belopolsky40018472011-02-26 01:02:56 +00004946PyObject *
4947PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004948{
4949 if (!PyUnicode_Check(unicode)) {
4950 PyErr_BadArgument();
4951 return NULL;
4952 }
4953 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004954 PyUnicode_GET_SIZE(unicode),
4955 NULL,
4956 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004957}
4958
Guido van Rossumd57fd912000-03-10 22:53:23 +00004959/* --- UTF-16 Codec ------------------------------------------------------- */
4960
Tim Peters772747b2001-08-09 22:21:55 +00004961PyObject *
4962PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004963 Py_ssize_t size,
4964 const char *errors,
4965 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004966{
Walter Dörwald69652032004-09-07 20:24:22 +00004967 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
4968}
4969
/* Two masks for fast checking of whether a C 'long' may contain
   UTF16-encoded surrogate characters. This is an efficient heuristic,
   assuming that non-surrogate characters with a code point >= 0x8000 are
   rare in most input.

   Each mask has one bit set per 16-bit unit of the 'long': the top bit of
   the unit for FAST_CHAR_MASK, and bit 7 (the top bit of the other byte)
   for SWAPPED_FAST_CHAR_MASK.  A word that has none of these bits set
   holds only code units below 0x8000, hence no surrogates.

   FAST_CHAR_MASK is used when the input is in native byte ordering,
   SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
*/
#if (SIZEOF_LONG == 8)
# define FAST_CHAR_MASK         0x8000800080008000L
# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
#elif (SIZEOF_LONG == 4)
# define FAST_CHAR_MASK         0x80008000L
# define SWAPPED_FAST_CHAR_MASK 0x00800080L
#else
# error C 'long' size should be either 4 or 8!
#endif
4986
Walter Dörwald69652032004-09-07 20:24:22 +00004987PyObject *
4988PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004989 Py_ssize_t size,
4990 const char *errors,
4991 int *byteorder,
4992 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004993{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004994 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004995 Py_ssize_t startinpos;
4996 Py_ssize_t endinpos;
4997 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004998 PyUnicodeObject *unicode;
4999 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00005000 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00005001 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00005002 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005003 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00005004 /* Offsets from q for retrieving byte pairs in the right order. */
5005#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5006 int ihi = 1, ilo = 0;
5007#else
5008 int ihi = 0, ilo = 1;
5009#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005010 PyObject *errorHandler = NULL;
5011 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005012
5013 /* Note: size will always be longer than the resulting Unicode
5014 character count */
5015 unicode = _PyUnicode_New(size);
5016 if (!unicode)
5017 return NULL;
5018 if (size == 0)
5019 return (PyObject *)unicode;
5020
5021 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005022 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00005023 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005024 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005025
5026 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005027 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005028
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005029 /* Check for BOM marks (U+FEFF) in the input and adjust current
5030 byte order setting accordingly. In native mode, the leading BOM
5031 mark is skipped, in all other modes, it is copied to the output
5032 stream as-is (giving a ZWNBSP character). */
5033 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005034 if (size >= 2) {
5035 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005036#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005037 if (bom == 0xFEFF) {
5038 q += 2;
5039 bo = -1;
5040 }
5041 else if (bom == 0xFFFE) {
5042 q += 2;
5043 bo = 1;
5044 }
Tim Petersced69f82003-09-16 20:30:58 +00005045#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005046 if (bom == 0xFEFF) {
5047 q += 2;
5048 bo = 1;
5049 }
5050 else if (bom == 0xFFFE) {
5051 q += 2;
5052 bo = -1;
5053 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005054#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005055 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005056 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005057
Tim Peters772747b2001-08-09 22:21:55 +00005058 if (bo == -1) {
5059 /* force LE */
5060 ihi = 1;
5061 ilo = 0;
5062 }
5063 else if (bo == 1) {
5064 /* force BE */
5065 ihi = 0;
5066 ilo = 1;
5067 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005068#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5069 native_ordering = ilo < ihi;
5070#else
5071 native_ordering = ilo > ihi;
5072#endif
Tim Peters772747b2001-08-09 22:21:55 +00005073
Antoine Pitrouab868312009-01-10 15:40:25 +00005074 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005075 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005076 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005077 /* First check for possible aligned read of a C 'long'. Unaligned
5078 reads are more expensive, better to defer to another iteration. */
5079 if (!((size_t) q & LONG_PTR_MASK)) {
5080 /* Fast path for runs of non-surrogate chars. */
5081 register const unsigned char *_q = q;
5082 Py_UNICODE *_p = p;
5083 if (native_ordering) {
5084 /* Native ordering is simple: as long as the input cannot
5085 possibly contain a surrogate char, do an unrolled copy
5086 of several 16-bit code points to the target object.
5087 The non-surrogate check is done on several input bytes
5088 at a time (as many as a C 'long' can contain). */
5089 while (_q < aligned_end) {
5090 unsigned long data = * (unsigned long *) _q;
5091 if (data & FAST_CHAR_MASK)
5092 break;
5093 _p[0] = ((unsigned short *) _q)[0];
5094 _p[1] = ((unsigned short *) _q)[1];
5095#if (SIZEOF_LONG == 8)
5096 _p[2] = ((unsigned short *) _q)[2];
5097 _p[3] = ((unsigned short *) _q)[3];
5098#endif
5099 _q += SIZEOF_LONG;
5100 _p += SIZEOF_LONG / 2;
5101 }
5102 }
5103 else {
5104 /* Byteswapped ordering is similar, but we must decompose
5105 the copy bytewise, and take care of zero'ing out the
5106 upper bytes if the target object is in 32-bit units
5107 (that is, in UCS-4 builds). */
5108 while (_q < aligned_end) {
5109 unsigned long data = * (unsigned long *) _q;
5110 if (data & SWAPPED_FAST_CHAR_MASK)
5111 break;
5112 /* Zero upper bytes in UCS-4 builds */
5113#if (Py_UNICODE_SIZE > 2)
5114 _p[0] = 0;
5115 _p[1] = 0;
5116#if (SIZEOF_LONG == 8)
5117 _p[2] = 0;
5118 _p[3] = 0;
5119#endif
5120#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005121 /* Issue #4916; UCS-4 builds on big endian machines must
5122 fill the two last bytes of each 4-byte unit. */
5123#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
5124# define OFF 2
5125#else
5126# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00005127#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005128 ((unsigned char *) _p)[OFF + 1] = _q[0];
5129 ((unsigned char *) _p)[OFF + 0] = _q[1];
5130 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
5131 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
5132#if (SIZEOF_LONG == 8)
5133 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
5134 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
5135 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
5136 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
5137#endif
5138#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00005139 _q += SIZEOF_LONG;
5140 _p += SIZEOF_LONG / 2;
5141 }
5142 }
5143 p = _p;
5144 q = _q;
5145 if (q >= e)
5146 break;
5147 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005148 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005149
Benjamin Peterson14339b62009-01-31 16:36:08 +00005150 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005151
5152 if (ch < 0xD800 || ch > 0xDFFF) {
5153 *p++ = ch;
5154 continue;
5155 }
5156
5157 /* UTF-16 code pair: */
5158 if (q > e) {
5159 errmsg = "unexpected end of data";
5160 startinpos = (((const char *)q) - 2) - starts;
5161 endinpos = ((const char *)e) + 1 - starts;
5162 goto utf16Error;
5163 }
5164 if (0xD800 <= ch && ch <= 0xDBFF) {
5165 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
5166 q += 2;
5167 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00005168#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005169 *p++ = ch;
5170 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005171#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005172 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005173#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005174 continue;
5175 }
5176 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005177 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005178 startinpos = (((const char *)q)-4)-starts;
5179 endinpos = startinpos+2;
5180 goto utf16Error;
5181 }
5182
Benjamin Peterson14339b62009-01-31 16:36:08 +00005183 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005184 errmsg = "illegal encoding";
5185 startinpos = (((const char *)q)-2)-starts;
5186 endinpos = startinpos+2;
5187 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005188
Benjamin Peterson29060642009-01-31 22:14:21 +00005189 utf16Error:
5190 outpos = p - PyUnicode_AS_UNICODE(unicode);
5191 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005192 errors,
5193 &errorHandler,
5194 "utf16", errmsg,
5195 &starts,
5196 (const char **)&e,
5197 &startinpos,
5198 &endinpos,
5199 &exc,
5200 (const char **)&q,
5201 &unicode,
5202 &outpos,
5203 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00005204 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005205 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005206 /* remaining byte at the end? (size should be even) */
5207 if (e == q) {
5208 if (!consumed) {
5209 errmsg = "truncated data";
5210 startinpos = ((const char *)q) - starts;
5211 endinpos = ((const char *)e) + 1 - starts;
5212 outpos = p - PyUnicode_AS_UNICODE(unicode);
5213 if (unicode_decode_call_errorhandler(
5214 errors,
5215 &errorHandler,
5216 "utf16", errmsg,
5217 &starts,
5218 (const char **)&e,
5219 &startinpos,
5220 &endinpos,
5221 &exc,
5222 (const char **)&q,
5223 &unicode,
5224 &outpos,
5225 &p))
5226 goto onError;
5227 /* The remaining input chars are ignored if the callback
5228 chooses to skip the input */
5229 }
5230 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005231
5232 if (byteorder)
5233 *byteorder = bo;
5234
Walter Dörwald69652032004-09-07 20:24:22 +00005235 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005236 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005237
Guido van Rossumd57fd912000-03-10 22:53:23 +00005238 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02005239 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005240 goto onError;
5241
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005242 Py_XDECREF(errorHandler);
5243 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005244#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005245 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005246 Py_DECREF(unicode);
5247 return NULL;
5248 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005249#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005250 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005251 return (PyObject *)unicode;
5252
Benjamin Peterson29060642009-01-31 22:14:21 +00005253 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005254 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005255 Py_XDECREF(errorHandler);
5256 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005257 return NULL;
5258}
5259
Antoine Pitrouab868312009-01-10 15:40:25 +00005260#undef FAST_CHAR_MASK
5261#undef SWAPPED_FAST_CHAR_MASK
5262
Tim Peters772747b2001-08-09 22:21:55 +00005263PyObject *
5264PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005265 Py_ssize_t size,
5266 const char *errors,
5267 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005268{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005269 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005270 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005271 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005272#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005273 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005274#else
5275 const int pairs = 0;
5276#endif
Tim Peters772747b2001-08-09 22:21:55 +00005277 /* Offsets from p for storing byte pairs in the right order. */
5278#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5279 int ihi = 1, ilo = 0;
5280#else
5281 int ihi = 0, ilo = 1;
5282#endif
5283
Benjamin Peterson29060642009-01-31 22:14:21 +00005284#define STORECHAR(CH) \
5285 do { \
5286 p[ihi] = ((CH) >> 8) & 0xff; \
5287 p[ilo] = (CH) & 0xff; \
5288 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005289 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005290
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005291#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005292 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00005293 if (s[i] >= 0x10000)
5294 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005295#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005296 /* 2 * (size + pairs + (byteorder == 0)) */
5297 if (size > PY_SSIZE_T_MAX ||
5298 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005299 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005300 nsize = size + pairs + (byteorder == 0);
5301 bytesize = nsize * 2;
5302 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005303 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005304 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005305 if (v == NULL)
5306 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005307
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005308 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005309 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005310 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005311 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005312 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005313
5314 if (byteorder == -1) {
5315 /* force LE */
5316 ihi = 1;
5317 ilo = 0;
5318 }
5319 else if (byteorder == 1) {
5320 /* force BE */
5321 ihi = 0;
5322 ilo = 1;
5323 }
5324
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005325 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005326 Py_UNICODE ch = *s++;
5327 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005328#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005329 if (ch >= 0x10000) {
5330 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5331 ch = 0xD800 | ((ch-0x10000) >> 10);
5332 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005333#endif
Tim Peters772747b2001-08-09 22:21:55 +00005334 STORECHAR(ch);
5335 if (ch2)
5336 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005337 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005338
5339 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005340 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005341#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005342}
5343
Alexander Belopolsky40018472011-02-26 01:02:56 +00005344PyObject *
5345PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005346{
5347 if (!PyUnicode_Check(unicode)) {
5348 PyErr_BadArgument();
5349 return NULL;
5350 }
5351 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005352 PyUnicode_GET_SIZE(unicode),
5353 NULL,
5354 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005355}
5356
5357/* --- Unicode Escape Codec ----------------------------------------------- */
5358
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005359/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5360 if all the escapes in the string make it still a valid ASCII string.
5361 Returns -1 if any escapes were found which cause the string to
5362 pop out of ASCII range. Otherwise returns the length of the
5363 required buffer to hold the string.
5364 */
5365Py_ssize_t
5366length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5367{
5368 const unsigned char *p = (const unsigned char *)s;
5369 const unsigned char *end = p + size;
5370 Py_ssize_t length = 0;
5371
5372 if (size < 0)
5373 return -1;
5374
5375 for (; p < end; ++p) {
5376 if (*p > 127) {
5377 /* Non-ASCII */
5378 return -1;
5379 }
5380 else if (*p != '\\') {
5381 /* Normal character */
5382 ++length;
5383 }
5384 else {
5385 /* Backslash-escape, check next char */
5386 ++p;
5387 /* Escape sequence reaches till end of string or
5388 non-ASCII follow-up. */
5389 if (p >= end || *p > 127)
5390 return -1;
5391 switch (*p) {
5392 case '\n':
5393 /* backslash + \n result in zero characters */
5394 break;
5395 case '\\': case '\'': case '\"':
5396 case 'b': case 'f': case 't':
5397 case 'n': case 'r': case 'v': case 'a':
5398 ++length;
5399 break;
5400 case '0': case '1': case '2': case '3':
5401 case '4': case '5': case '6': case '7':
5402 case 'x': case 'u': case 'U': case 'N':
5403 /* these do not guarantee ASCII characters */
5404 return -1;
5405 default:
5406 /* count the backslash + the other character */
5407 length += 2;
5408 }
5409 }
5410 }
5411 return length;
5412}
5413
/* Store `value` at position `index` of `buf`.  When `kind` is
   PyUnicode_WCHAR_KIND the buffer is treated as a Py_UNICODE array;
   for any other kind it is treated as an array of single bytes.
   `index` is evaluated exactly once. */
#define WRITE_ASCII_OR_WSTR(kind, buf, index, value)                        \
    do {                                                                    \
        if ((kind) == PyUnicode_WCHAR_KIND) {                               \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);           \
        }                                                                   \
        else {                                                              \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value);     \
        }                                                                   \
    } while (0)

/* Store `value` into a Py_UNICODE buffer.  Asserts that the variable
   `kind` visible at the expansion site equals PyUnicode_WCHAR_KIND. */
#define WRITE_WSTR(buf, index, value)                                       \
    (assert(kind == PyUnicode_WCHAR_KIND),                                  \
     ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value))
5427
5428
Fredrik Lundh06d12682001-01-24 07:59:11 +00005429static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005430
Alexander Belopolsky40018472011-02-26 01:02:56 +00005431PyObject *
5432PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005433 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005434 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005435{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005436 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005437 Py_ssize_t startinpos;
5438 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005439 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005440 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005441 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005442 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005443 char* message;
5444 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005445 PyObject *errorHandler = NULL;
5446 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005447 Py_ssize_t ascii_length;
5448 Py_ssize_t i;
5449 int kind;
5450 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005451
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005452 ascii_length = length_of_escaped_ascii_string(s, size);
5453
5454 /* After length_of_escaped_ascii_string() there are two alternatives,
5455 either the string is pure ASCII with named escapes like \n, etc.
5456 and we determined it's exact size (common case)
5457 or it contains \x, \u, ... escape sequences. then we create a
5458 legacy wchar string and resize it at the end of this function. */
5459 if (ascii_length >= 0) {
5460 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
5461 if (!v)
5462 goto onError;
5463 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5464 kind = PyUnicode_1BYTE_KIND;
5465 data = PyUnicode_DATA(v);
5466 }
5467 else {
5468 /* Escaped strings will always be longer than the resulting
5469 Unicode string, so we start with size here and then reduce the
5470 length after conversion to the true value.
5471 (but if the error callback returns a long replacement string
5472 we'll have to allocate more space) */
5473 v = _PyUnicode_New(size);
5474 if (!v)
5475 goto onError;
5476 kind = PyUnicode_WCHAR_KIND;
5477 data = PyUnicode_AS_UNICODE(v);
5478 }
5479
Guido van Rossumd57fd912000-03-10 22:53:23 +00005480 if (size == 0)
5481 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005482 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005483 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005484
Guido van Rossumd57fd912000-03-10 22:53:23 +00005485 while (s < end) {
5486 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005487 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005488 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005489
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005490 if (kind == PyUnicode_WCHAR_KIND) {
5491 assert(i < _PyUnicode_WSTR_LENGTH(v));
5492 }
5493 else {
5494 /* The only case in which i == ascii_length is a backslash
5495 followed by a newline. */
5496 assert(i <= ascii_length);
5497 }
5498
Guido van Rossumd57fd912000-03-10 22:53:23 +00005499 /* Non-escape characters are interpreted as Unicode ordinals */
5500 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005501 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005502 continue;
5503 }
5504
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005505 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005506 /* \ - Escapes */
5507 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005508 c = *s++;
5509 if (s > end)
5510 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005511
5512 if (kind == PyUnicode_WCHAR_KIND) {
5513 assert(i < _PyUnicode_WSTR_LENGTH(v));
5514 }
5515 else {
5516 /* The only case in which i == ascii_length is a backslash
5517 followed by a newline. */
5518 assert(i < ascii_length || (i == ascii_length && c == '\n'));
5519 }
5520
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005521 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005522
Benjamin Peterson29060642009-01-31 22:14:21 +00005523 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005524 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005525 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5526 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5527 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5528 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5529 /* FF */
5530 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5531 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5532 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5533 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5534 /* VT */
5535 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5536 /* BEL, not classic C */
5537 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005538
Benjamin Peterson29060642009-01-31 22:14:21 +00005539 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005540 case '0': case '1': case '2': case '3':
5541 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005542 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005543 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005544 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005545 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005546 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005547 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005548 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005549 break;
5550
Benjamin Peterson29060642009-01-31 22:14:21 +00005551 /* hex escapes */
5552 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005553 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005554 digits = 2;
5555 message = "truncated \\xXX escape";
5556 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005557
Benjamin Peterson29060642009-01-31 22:14:21 +00005558 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005559 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005560 digits = 4;
5561 message = "truncated \\uXXXX escape";
5562 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005563
Benjamin Peterson29060642009-01-31 22:14:21 +00005564 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005565 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005566 digits = 8;
5567 message = "truncated \\UXXXXXXXX escape";
5568 hexescape:
5569 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005570 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005571 if (s+digits>end) {
5572 endinpos = size;
5573 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005574 errors, &errorHandler,
5575 "unicodeescape", "end of string in escape sequence",
5576 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005577 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005578 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005579 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005580 goto nextByte;
5581 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005582 for (j = 0; j < digits; ++j) {
5583 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005584 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005585 endinpos = (s+j+1)-starts;
5586 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005587 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005588 errors, &errorHandler,
5589 "unicodeescape", message,
5590 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005591 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005592 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005593 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005594 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005595 }
5596 chr = (chr<<4) & ~0xF;
5597 if (c >= '0' && c <= '9')
5598 chr += c - '0';
5599 else if (c >= 'a' && c <= 'f')
5600 chr += 10 + c - 'a';
5601 else
5602 chr += 10 + c - 'A';
5603 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005604 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005605 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005606 /* _decoding_error will have already written into the
5607 target buffer. */
5608 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005609 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005610 /* when we get here, chr is a 32-bit unicode character */
5611 if (chr <= 0xffff)
5612 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005613 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005614 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005615 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005616 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005617#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005618 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005619#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005620 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005621 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5622 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005623#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005624 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005625 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005626 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005627 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005628 errors, &errorHandler,
5629 "unicodeescape", "illegal Unicode character",
5630 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005631 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005632 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005633 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005634 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005635 break;
5636
Benjamin Peterson29060642009-01-31 22:14:21 +00005637 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005638 case 'N':
5639 message = "malformed \\N character escape";
5640 if (ucnhash_CAPI == NULL) {
5641 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005642 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5643 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005644 if (ucnhash_CAPI == NULL)
5645 goto ucnhashError;
5646 }
5647 if (*s == '{') {
5648 const char *start = s+1;
5649 /* look for the closing brace */
5650 while (*s != '}' && s < end)
5651 s++;
5652 if (s > start && s < end && *s == '}') {
5653 /* found a name. look it up in the unicode database */
5654 message = "unknown Unicode character name";
5655 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005656 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5657 &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005658 goto store;
5659 }
5660 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005661 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005662 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005663 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005664 errors, &errorHandler,
5665 "unicodeescape", message,
5666 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005667 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005668 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005669 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005670 break;
5671
5672 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005673 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005674 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005675 message = "\\ at end of string";
5676 s--;
5677 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005678 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005679 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005680 errors, &errorHandler,
5681 "unicodeescape", message,
5682 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005683 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005684 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005685 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005686 }
5687 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005688 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5689 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005690 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005691 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005692 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005693 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005694 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005695 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005696 /* Ensure the length prediction worked in case of ASCII strings */
5697 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5698
Victor Stinnerfe226c02011-10-03 03:52:20 +02005699 if (kind == PyUnicode_WCHAR_KIND)
5700 {
5701 if (PyUnicode_Resize((PyObject**)&v, i) < 0)
5702 goto onError;
Victor Stinnerfe226c02011-10-03 03:52:20 +02005703 }
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005704 Py_XDECREF(errorHandler);
5705 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005706#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005707 if (_PyUnicode_READY_REPLACE(&v)) {
5708 Py_DECREF(v);
5709 return NULL;
5710 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005711#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005712 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005713 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005714
Benjamin Peterson29060642009-01-31 22:14:21 +00005715 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005716 PyErr_SetString(
5717 PyExc_UnicodeError,
5718 "\\N escapes not supported (can't load unicodedata module)"
5719 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005720 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005721 Py_XDECREF(errorHandler);
5722 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005723 return NULL;
5724
Benjamin Peterson29060642009-01-31 22:14:21 +00005725 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005726 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005727 Py_XDECREF(errorHandler);
5728 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005729 return NULL;
5730}
5731
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005732#undef WRITE_ASCII_OR_WSTR
5733#undef WRITE_WSTR
5734
/* Return a Unicode-Escape encoded bytes version of a Py_UNICODE buffer.

   The result is the bare escaped text; it is not enclosed in quotes.

*/
5741
Walter Dörwald79e913e2007-05-12 11:08:06 +00005742static const char *hexdigits = "0123456789abcdef";
5743
Alexander Belopolsky40018472011-02-26 01:02:56 +00005744PyObject *
5745PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005746 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005747{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005748 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005749 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005750
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005751#ifdef Py_UNICODE_WIDE
5752 const Py_ssize_t expandsize = 10;
5753#else
5754 const Py_ssize_t expandsize = 6;
5755#endif
5756
Thomas Wouters89f507f2006-12-13 04:49:30 +00005757 /* XXX(nnorwitz): rather than over-allocating, it would be
5758 better to choose a different scheme. Perhaps scan the
5759 first N-chars of the string and allocate based on that size.
5760 */
5761 /* Initial allocation is based on the longest-possible unichr
5762 escape.
5763
5764 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5765 unichr, so in this case it's the longest unichr escape. In
5766 narrow (UTF-16) builds this is five chars per source unichr
5767 since there are two unichrs in the surrogate pair, so in narrow
5768 (UTF-16) builds it's not the longest unichr escape.
5769
5770 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5771 so in the narrow (UTF-16) build case it's the longest unichr
5772 escape.
5773 */
5774
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005775 if (size == 0)
5776 return PyBytes_FromStringAndSize(NULL, 0);
5777
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005778 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005779 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005780
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005781 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005782 2
5783 + expandsize*size
5784 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005785 if (repr == NULL)
5786 return NULL;
5787
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005788 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005789
Guido van Rossumd57fd912000-03-10 22:53:23 +00005790 while (size-- > 0) {
5791 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005792
Walter Dörwald79e913e2007-05-12 11:08:06 +00005793 /* Escape backslashes */
5794 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005795 *p++ = '\\';
5796 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005797 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005798 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005799
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00005800#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005801 /* Map 21-bit characters to '\U00xxxxxx' */
5802 else if (ch >= 0x10000) {
5803 *p++ = '\\';
5804 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005805 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5806 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5807 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5808 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5809 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5810 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5811 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5812 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005813 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005814 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005815#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005816 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5817 else if (ch >= 0xD800 && ch < 0xDC00) {
5818 Py_UNICODE ch2;
5819 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00005820
Benjamin Peterson29060642009-01-31 22:14:21 +00005821 ch2 = *s++;
5822 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005823 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005824 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5825 *p++ = '\\';
5826 *p++ = 'U';
5827 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
5828 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
5829 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
5830 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
5831 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
5832 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
5833 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
5834 *p++ = hexdigits[ucs & 0x0000000F];
5835 continue;
5836 }
5837 /* Fall through: isolated surrogates are copied as-is */
5838 s--;
5839 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005840 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005841#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005842
Guido van Rossumd57fd912000-03-10 22:53:23 +00005843 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005844 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005845 *p++ = '\\';
5846 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005847 *p++ = hexdigits[(ch >> 12) & 0x000F];
5848 *p++ = hexdigits[(ch >> 8) & 0x000F];
5849 *p++ = hexdigits[(ch >> 4) & 0x000F];
5850 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005851 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005852
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005853 /* Map special whitespace to '\t', \n', '\r' */
5854 else if (ch == '\t') {
5855 *p++ = '\\';
5856 *p++ = 't';
5857 }
5858 else if (ch == '\n') {
5859 *p++ = '\\';
5860 *p++ = 'n';
5861 }
5862 else if (ch == '\r') {
5863 *p++ = '\\';
5864 *p++ = 'r';
5865 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005866
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005867 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005868 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005869 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005870 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005871 *p++ = hexdigits[(ch >> 4) & 0x000F];
5872 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005873 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005874
Guido van Rossumd57fd912000-03-10 22:53:23 +00005875 /* Copy everything else as-is */
5876 else
5877 *p++ = (char) ch;
5878 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005879
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005880 assert(p - PyBytes_AS_STRING(repr) > 0);
5881 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5882 return NULL;
5883 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005884}
5885
Alexander Belopolsky40018472011-02-26 01:02:56 +00005886PyObject *
5887PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005888{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005889 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005890 if (!PyUnicode_Check(unicode)) {
5891 PyErr_BadArgument();
5892 return NULL;
5893 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00005894 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5895 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005896 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005897}
5898
5899/* --- Raw Unicode Escape Codec ------------------------------------------- */
5900
Alexander Belopolsky40018472011-02-26 01:02:56 +00005901PyObject *
5902PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005903 Py_ssize_t size,
5904 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005905{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005906 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005907 Py_ssize_t startinpos;
5908 Py_ssize_t endinpos;
5909 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005910 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005911 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005912 const char *end;
5913 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005914 PyObject *errorHandler = NULL;
5915 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005916
Guido van Rossumd57fd912000-03-10 22:53:23 +00005917 /* Escaped strings will always be longer than the resulting
5918 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005919 length after conversion to the true value. (But decoding error
5920 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005921 v = _PyUnicode_New(size);
5922 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005923 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005924 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005925 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005926 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005927 end = s + size;
5928 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005929 unsigned char c;
5930 Py_UCS4 x;
5931 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005932 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005933
Benjamin Peterson29060642009-01-31 22:14:21 +00005934 /* Non-escape characters are interpreted as Unicode ordinals */
5935 if (*s != '\\') {
5936 *p++ = (unsigned char)*s++;
5937 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005938 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005939 startinpos = s-starts;
5940
5941 /* \u-escapes are only interpreted iff the number of leading
5942 backslashes if odd */
5943 bs = s;
5944 for (;s < end;) {
5945 if (*s != '\\')
5946 break;
5947 *p++ = (unsigned char)*s++;
5948 }
5949 if (((s - bs) & 1) == 0 ||
5950 s >= end ||
5951 (*s != 'u' && *s != 'U')) {
5952 continue;
5953 }
5954 p--;
5955 count = *s=='u' ? 4 : 8;
5956 s++;
5957
5958 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
5959 outpos = p-PyUnicode_AS_UNICODE(v);
5960 for (x = 0, i = 0; i < count; ++i, ++s) {
5961 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005962 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005963 endinpos = s-starts;
5964 if (unicode_decode_call_errorhandler(
5965 errors, &errorHandler,
5966 "rawunicodeescape", "truncated \\uXXXX",
5967 &starts, &end, &startinpos, &endinpos, &exc, &s,
5968 &v, &outpos, &p))
5969 goto onError;
5970 goto nextByte;
5971 }
5972 x = (x<<4) & ~0xF;
5973 if (c >= '0' && c <= '9')
5974 x += c - '0';
5975 else if (c >= 'a' && c <= 'f')
5976 x += 10 + c - 'a';
5977 else
5978 x += 10 + c - 'A';
5979 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00005980 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00005981 /* UCS-2 character */
5982 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005983 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005984 /* UCS-4 character. Either store directly, or as
5985 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00005986#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005987 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005988#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005989 x -= 0x10000L;
5990 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
5991 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00005992#endif
5993 } else {
5994 endinpos = s-starts;
5995 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005996 if (unicode_decode_call_errorhandler(
5997 errors, &errorHandler,
5998 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005999 &starts, &end, &startinpos, &endinpos, &exc, &s,
6000 &v, &outpos, &p))
6001 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006002 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006003 nextByte:
6004 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006005 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02006006 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006007 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006008 Py_XDECREF(errorHandler);
6009 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006010#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006011 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006012 Py_DECREF(v);
6013 return NULL;
6014 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006015#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006016 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006017 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006018
Benjamin Peterson29060642009-01-31 22:14:21 +00006019 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006020 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006021 Py_XDECREF(errorHandler);
6022 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006023 return NULL;
6024}
6025
Alexander Belopolsky40018472011-02-26 01:02:56 +00006026PyObject *
6027PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006028 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006029{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006030 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006031 char *p;
6032 char *q;
6033
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006034#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006035 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006036#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006037 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006038#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00006039
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006040 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006041 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006042
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006043 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006044 if (repr == NULL)
6045 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00006046 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006047 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006048
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006049 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006050 while (size-- > 0) {
6051 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006052#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006053 /* Map 32-bit characters to '\Uxxxxxxxx' */
6054 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006055 *p++ = '\\';
6056 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00006057 *p++ = hexdigits[(ch >> 28) & 0xf];
6058 *p++ = hexdigits[(ch >> 24) & 0xf];
6059 *p++ = hexdigits[(ch >> 20) & 0xf];
6060 *p++ = hexdigits[(ch >> 16) & 0xf];
6061 *p++ = hexdigits[(ch >> 12) & 0xf];
6062 *p++ = hexdigits[(ch >> 8) & 0xf];
6063 *p++ = hexdigits[(ch >> 4) & 0xf];
6064 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006065 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006066 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00006067#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006068 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
6069 if (ch >= 0xD800 && ch < 0xDC00) {
6070 Py_UNICODE ch2;
6071 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006072
Benjamin Peterson29060642009-01-31 22:14:21 +00006073 ch2 = *s++;
6074 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00006075 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006076 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
6077 *p++ = '\\';
6078 *p++ = 'U';
6079 *p++ = hexdigits[(ucs >> 28) & 0xf];
6080 *p++ = hexdigits[(ucs >> 24) & 0xf];
6081 *p++ = hexdigits[(ucs >> 20) & 0xf];
6082 *p++ = hexdigits[(ucs >> 16) & 0xf];
6083 *p++ = hexdigits[(ucs >> 12) & 0xf];
6084 *p++ = hexdigits[(ucs >> 8) & 0xf];
6085 *p++ = hexdigits[(ucs >> 4) & 0xf];
6086 *p++ = hexdigits[ucs & 0xf];
6087 continue;
6088 }
6089 /* Fall through: isolated surrogates are copied as-is */
6090 s--;
6091 size++;
6092 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006093#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006094 /* Map 16-bit characters to '\uxxxx' */
6095 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006096 *p++ = '\\';
6097 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00006098 *p++ = hexdigits[(ch >> 12) & 0xf];
6099 *p++ = hexdigits[(ch >> 8) & 0xf];
6100 *p++ = hexdigits[(ch >> 4) & 0xf];
6101 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006102 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006103 /* Copy everything else as-is */
6104 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006105 *p++ = (char) ch;
6106 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006107 size = p - q;
6108
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006109 assert(size > 0);
6110 if (_PyBytes_Resize(&repr, size) < 0)
6111 return NULL;
6112 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006113}
6114
Alexander Belopolsky40018472011-02-26 01:02:56 +00006115PyObject *
6116PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006117{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006118 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006119 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00006120 PyErr_BadArgument();
6121 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006122 }
Walter Dörwald711005d2007-05-12 12:03:26 +00006123 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
6124 PyUnicode_GET_SIZE(unicode));
6125
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006126 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006127}
6128
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006129/* --- Unicode Internal Codec ------------------------------------------- */
6130
Alexander Belopolsky40018472011-02-26 01:02:56 +00006131PyObject *
6132_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006133 Py_ssize_t size,
6134 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006135{
6136 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006137 Py_ssize_t startinpos;
6138 Py_ssize_t endinpos;
6139 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006140 PyUnicodeObject *v;
6141 Py_UNICODE *p;
6142 const char *end;
6143 const char *reason;
6144 PyObject *errorHandler = NULL;
6145 PyObject *exc = NULL;
6146
Neal Norwitzd43069c2006-01-08 01:12:10 +00006147#ifdef Py_UNICODE_WIDE
6148 Py_UNICODE unimax = PyUnicode_GetMax();
6149#endif
6150
Thomas Wouters89f507f2006-12-13 04:49:30 +00006151 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006152 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
6153 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006154 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006155 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
6156 as string was created with the old API. */
6157 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006158 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006159 p = PyUnicode_AS_UNICODE(v);
6160 end = s + size;
6161
6162 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006163 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006164 /* We have to sanity check the raw data, otherwise doom looms for
6165 some malformed UCS-4 data. */
6166 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006167#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006168 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006169#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006170 end-s < Py_UNICODE_SIZE
6171 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006172 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006173 startinpos = s - starts;
6174 if (end-s < Py_UNICODE_SIZE) {
6175 endinpos = end-starts;
6176 reason = "truncated input";
6177 }
6178 else {
6179 endinpos = s - starts + Py_UNICODE_SIZE;
6180 reason = "illegal code point (> 0x10FFFF)";
6181 }
6182 outpos = p - PyUnicode_AS_UNICODE(v);
6183 if (unicode_decode_call_errorhandler(
6184 errors, &errorHandler,
6185 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006186 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00006187 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006188 goto onError;
6189 }
6190 }
6191 else {
6192 p++;
6193 s += Py_UNICODE_SIZE;
6194 }
6195 }
6196
Victor Stinnerfe226c02011-10-03 03:52:20 +02006197 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006198 goto onError;
6199 Py_XDECREF(errorHandler);
6200 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006201#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006202 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006203 Py_DECREF(v);
6204 return NULL;
6205 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006206#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006207 assert(_PyUnicode_CheckConsistency(v, 1));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006208 return (PyObject *)v;
6209
Benjamin Peterson29060642009-01-31 22:14:21 +00006210 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006211 Py_XDECREF(v);
6212 Py_XDECREF(errorHandler);
6213 Py_XDECREF(exc);
6214 return NULL;
6215}
6216
Guido van Rossumd57fd912000-03-10 22:53:23 +00006217/* --- Latin-1 Codec ------------------------------------------------------ */
6218
Alexander Belopolsky40018472011-02-26 01:02:56 +00006219PyObject *
6220PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006221 Py_ssize_t size,
6222 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006223{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006224 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006225 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006226}
6227
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006228/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006229static void
6230make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006231 const char *encoding,
6232 const Py_UNICODE *unicode, Py_ssize_t size,
6233 Py_ssize_t startpos, Py_ssize_t endpos,
6234 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006235{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006236 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006237 *exceptionObject = PyUnicodeEncodeError_Create(
6238 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006239 }
6240 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006241 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6242 goto onError;
6243 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6244 goto onError;
6245 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6246 goto onError;
6247 return;
6248 onError:
6249 Py_DECREF(*exceptionObject);
6250 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006251 }
6252}
6253
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006254/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006255static void
6256raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006257 const char *encoding,
6258 const Py_UNICODE *unicode, Py_ssize_t size,
6259 Py_ssize_t startpos, Py_ssize_t endpos,
6260 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006261{
6262 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006263 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006264 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006265 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006266}
6267
6268/* error handling callback helper:
6269 build arguments, call the callback and check the arguments,
6270 put the result into newpos and return the replacement string, which
6271 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006272static PyObject *
6273unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006274 PyObject **errorHandler,
6275 const char *encoding, const char *reason,
6276 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
6277 Py_ssize_t startpos, Py_ssize_t endpos,
6278 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006279{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006280 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006281
6282 PyObject *restuple;
6283 PyObject *resunicode;
6284
6285 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006286 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006287 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006288 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006289 }
6290
6291 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006292 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006293 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006294 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006295
6296 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006297 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006298 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006299 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006300 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006301 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006302 Py_DECREF(restuple);
6303 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006304 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006305 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006306 &resunicode, newpos)) {
6307 Py_DECREF(restuple);
6308 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006309 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006310 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6311 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6312 Py_DECREF(restuple);
6313 return NULL;
6314 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006315 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006316 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006317 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006318 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6319 Py_DECREF(restuple);
6320 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006321 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006322 Py_INCREF(resunicode);
6323 Py_DECREF(restuple);
6324 return resunicode;
6325}
6326
Alexander Belopolsky40018472011-02-26 01:02:56 +00006327static PyObject *
6328unicode_encode_ucs1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006329 Py_ssize_t size,
6330 const char *errors,
6331 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006332{
6333 /* output object */
6334 PyObject *res;
6335 /* pointers to the beginning and end+1 of input */
6336 const Py_UNICODE *startp = p;
6337 const Py_UNICODE *endp = p + size;
6338 /* pointer to the beginning of the unencodable characters */
6339 /* const Py_UNICODE *badp = NULL; */
6340 /* pointer into the output */
6341 char *str;
6342 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006343 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006344 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6345 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006346 PyObject *errorHandler = NULL;
6347 PyObject *exc = NULL;
6348 /* the following variable is used for caching string comparisons
6349 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6350 int known_errorHandler = -1;
6351
6352 /* allocate enough for a simple encoding without
6353 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006354 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006355 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006356 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006357 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006358 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006359 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006360 ressize = size;
6361
6362 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006363 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006364
Benjamin Peterson29060642009-01-31 22:14:21 +00006365 /* can we encode this? */
6366 if (c<limit) {
6367 /* no overflow check, because we know that the space is enough */
6368 *str++ = (char)c;
6369 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006370 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006371 else {
6372 Py_ssize_t unicodepos = p-startp;
6373 Py_ssize_t requiredsize;
6374 PyObject *repunicode;
6375 Py_ssize_t repsize;
6376 Py_ssize_t newpos;
6377 Py_ssize_t respos;
6378 Py_UNICODE *uni2;
6379 /* startpos for collecting unencodable chars */
6380 const Py_UNICODE *collstart = p;
6381 const Py_UNICODE *collend = p;
6382 /* find all unecodable characters */
6383 while ((collend < endp) && ((*collend)>=limit))
6384 ++collend;
6385 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6386 if (known_errorHandler==-1) {
6387 if ((errors==NULL) || (!strcmp(errors, "strict")))
6388 known_errorHandler = 1;
6389 else if (!strcmp(errors, "replace"))
6390 known_errorHandler = 2;
6391 else if (!strcmp(errors, "ignore"))
6392 known_errorHandler = 3;
6393 else if (!strcmp(errors, "xmlcharrefreplace"))
6394 known_errorHandler = 4;
6395 else
6396 known_errorHandler = 0;
6397 }
6398 switch (known_errorHandler) {
6399 case 1: /* strict */
6400 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
6401 goto onError;
6402 case 2: /* replace */
6403 while (collstart++<collend)
6404 *str++ = '?'; /* fall through */
6405 case 3: /* ignore */
6406 p = collend;
6407 break;
6408 case 4: /* xmlcharrefreplace */
6409 respos = str - PyBytes_AS_STRING(res);
6410 /* determine replacement size (temporarily (mis)uses p) */
6411 for (p = collstart, repsize = 0; p < collend; ++p) {
6412 if (*p<10)
6413 repsize += 2+1+1;
6414 else if (*p<100)
6415 repsize += 2+2+1;
6416 else if (*p<1000)
6417 repsize += 2+3+1;
6418 else if (*p<10000)
6419 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006420#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006421 else
6422 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006423#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006424 else if (*p<100000)
6425 repsize += 2+5+1;
6426 else if (*p<1000000)
6427 repsize += 2+6+1;
6428 else
6429 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00006430#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006431 }
6432 requiredsize = respos+repsize+(endp-collend);
6433 if (requiredsize > ressize) {
6434 if (requiredsize<2*ressize)
6435 requiredsize = 2*ressize;
6436 if (_PyBytes_Resize(&res, requiredsize))
6437 goto onError;
6438 str = PyBytes_AS_STRING(res) + respos;
6439 ressize = requiredsize;
6440 }
6441 /* generate replacement (temporarily (mis)uses p) */
6442 for (p = collstart; p < collend; ++p) {
6443 str += sprintf(str, "&#%d;", (int)*p);
6444 }
6445 p = collend;
6446 break;
6447 default:
6448 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6449 encoding, reason, startp, size, &exc,
6450 collstart-startp, collend-startp, &newpos);
6451 if (repunicode == NULL)
6452 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006453 if (PyBytes_Check(repunicode)) {
6454 /* Directly copy bytes result to output. */
6455 repsize = PyBytes_Size(repunicode);
6456 if (repsize > 1) {
6457 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006458 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006459 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6460 Py_DECREF(repunicode);
6461 goto onError;
6462 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006463 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006464 ressize += repsize-1;
6465 }
6466 memcpy(str, PyBytes_AsString(repunicode), repsize);
6467 str += repsize;
6468 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006469 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006470 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006471 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006472 /* need more space? (at least enough for what we
6473 have+the replacement+the rest of the string, so
6474 we won't have to check space for encodable characters) */
6475 respos = str - PyBytes_AS_STRING(res);
6476 repsize = PyUnicode_GET_SIZE(repunicode);
6477 requiredsize = respos+repsize+(endp-collend);
6478 if (requiredsize > ressize) {
6479 if (requiredsize<2*ressize)
6480 requiredsize = 2*ressize;
6481 if (_PyBytes_Resize(&res, requiredsize)) {
6482 Py_DECREF(repunicode);
6483 goto onError;
6484 }
6485 str = PyBytes_AS_STRING(res) + respos;
6486 ressize = requiredsize;
6487 }
6488 /* check if there is anything unencodable in the replacement
6489 and copy it to the output */
6490 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
6491 c = *uni2;
6492 if (c >= limit) {
6493 raise_encode_exception(&exc, encoding, startp, size,
6494 unicodepos, unicodepos+1, reason);
6495 Py_DECREF(repunicode);
6496 goto onError;
6497 }
6498 *str = (char)c;
6499 }
6500 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006501 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006502 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006503 }
6504 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006505 /* Resize if we allocated to much */
6506 size = str - PyBytes_AS_STRING(res);
6507 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006508 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006509 if (_PyBytes_Resize(&res, size) < 0)
6510 goto onError;
6511 }
6512
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006513 Py_XDECREF(errorHandler);
6514 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006515 return res;
6516
6517 onError:
6518 Py_XDECREF(res);
6519 Py_XDECREF(errorHandler);
6520 Py_XDECREF(exc);
6521 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006522}
6523
Alexander Belopolsky40018472011-02-26 01:02:56 +00006524PyObject *
6525PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006526 Py_ssize_t size,
6527 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006528{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006529 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006530}
6531
Alexander Belopolsky40018472011-02-26 01:02:56 +00006532PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006533_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006534{
6535 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006536 PyErr_BadArgument();
6537 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006538 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006539 if (PyUnicode_READY(unicode) == -1)
6540 return NULL;
6541 /* Fast path: if it is a one-byte string, construct
6542 bytes object directly. */
6543 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6544 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6545 PyUnicode_GET_LENGTH(unicode));
6546 /* Non-Latin-1 characters present. Defer to above function to
6547 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006548 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006549 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006550 errors);
6551}
6552
6553PyObject*
6554PyUnicode_AsLatin1String(PyObject *unicode)
6555{
6556 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006557}
6558
6559/* --- 7-bit ASCII Codec -------------------------------------------------- */
6560
Alexander Belopolsky40018472011-02-26 01:02:56 +00006561PyObject *
6562PyUnicode_DecodeASCII(const char *s,
6563 Py_ssize_t size,
6564 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006565{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006566 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006567 PyUnicodeObject *v;
Victor Stinner702c7342011-10-05 13:50:52 +02006568 Py_UNICODE *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006569 Py_ssize_t startinpos;
6570 Py_ssize_t endinpos;
6571 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006572 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006573 int has_error;
6574 const unsigned char *p = (const unsigned char *)s;
6575 const unsigned char *end = p + size;
6576 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006577 PyObject *errorHandler = NULL;
6578 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006579
Guido van Rossumd57fd912000-03-10 22:53:23 +00006580 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006581 if (size == 1 && (unsigned char)s[0] < 128)
6582 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006583
Victor Stinner702c7342011-10-05 13:50:52 +02006584 has_error = 0;
6585 while (p < end && !has_error) {
6586 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6587 an explanation. */
6588 if (!((size_t) p & LONG_PTR_MASK)) {
6589 /* Help register allocation */
6590 register const unsigned char *_p = p;
6591 while (_p < aligned_end) {
6592 unsigned long value = *(unsigned long *) _p;
6593 if (value & ASCII_CHAR_MASK) {
6594 has_error = 1;
6595 break;
6596 }
6597 _p += SIZEOF_LONG;
6598 }
6599 if (_p == end)
6600 break;
6601 if (has_error)
6602 break;
6603 p = _p;
6604 }
6605 if (*p & 0x80) {
6606 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006607 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006608 }
6609 else {
6610 ++p;
6611 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006612 }
Victor Stinner702c7342011-10-05 13:50:52 +02006613 if (!has_error)
6614 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00006615
Guido van Rossumd57fd912000-03-10 22:53:23 +00006616 v = _PyUnicode_New(size);
6617 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006618 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006619 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006620 return (PyObject *)v;
Victor Stinner702c7342011-10-05 13:50:52 +02006621 u = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006622 e = s + size;
6623 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006624 register unsigned char c = (unsigned char)*s;
6625 if (c < 128) {
Victor Stinner702c7342011-10-05 13:50:52 +02006626 *u++ = c;
Benjamin Peterson29060642009-01-31 22:14:21 +00006627 ++s;
6628 }
6629 else {
6630 startinpos = s-starts;
6631 endinpos = startinpos + 1;
Victor Stinner702c7342011-10-05 13:50:52 +02006632 outpos = u - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006633 if (unicode_decode_call_errorhandler(
6634 errors, &errorHandler,
6635 "ascii", "ordinal not in range(128)",
6636 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinner702c7342011-10-05 13:50:52 +02006637 &v, &outpos, &u))
Benjamin Peterson29060642009-01-31 22:14:21 +00006638 goto onError;
6639 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006640 }
Victor Stinner702c7342011-10-05 13:50:52 +02006641 if (u - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
6642 if (PyUnicode_Resize((PyObject**)&v, u - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006643 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006644 Py_XDECREF(errorHandler);
6645 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006646#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006647 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006648 Py_DECREF(v);
6649 return NULL;
6650 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006651#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006652 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006653 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006654
Benjamin Peterson29060642009-01-31 22:14:21 +00006655 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006656 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006657 Py_XDECREF(errorHandler);
6658 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006659 return NULL;
6660}
6661
Alexander Belopolsky40018472011-02-26 01:02:56 +00006662PyObject *
6663PyUnicode_EncodeASCII(const Py_UNICODE *p,
6664 Py_ssize_t size,
6665 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006666{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006667 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006668}
6669
Alexander Belopolsky40018472011-02-26 01:02:56 +00006670PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006671_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006672{
6673 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006674 PyErr_BadArgument();
6675 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006676 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006677 if (PyUnicode_READY(unicode) == -1)
6678 return NULL;
6679 /* Fast path: if it is an ASCII-only string, construct bytes object
6680 directly. Else defer to above function to raise the exception. */
6681 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6682 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6683 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006684 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006685 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006686 errors);
6687}
6688
6689PyObject *
6690PyUnicode_AsASCIIString(PyObject *unicode)
6691{
6692 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006693}
6694
Victor Stinner99b95382011-07-04 14:23:54 +02006695#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006696
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006697/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006698
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006699#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006700#define NEED_RETRY
6701#endif
6702
6703/* XXX This code is limited to "true" double-byte encodings, as
6704 a) it assumes an incomplete character consists of a single byte, and
6705 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00006706 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006707
/* Report whether the byte at s[offset] should be treated as a DBCS
   lead byte.

   Returns 0 when IsDBCSLeadByte() rejects the byte.  Otherwise returns
   1 when CharPrev() finds no earlier character (prev == curr), when the
   byte at the previous character position is not itself a lead byte,
   or when the previous character is exactly two bytes wide; returns 0
   in the remaining case. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
6719
6720/*
6721 * Decode MBCS string into unicode object. If 'final' is set, converts
6722 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6723 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006724static int
6725decode_mbcs(PyUnicodeObject **v,
6726 const char *s, /* MBCS string */
6727 int size, /* sizeof MBCS string */
6728 int final,
6729 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006730{
6731 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00006732 Py_ssize_t n;
6733 DWORD usize;
6734 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006735
6736 assert(size >= 0);
6737
Victor Stinner554f3f02010-06-16 23:33:54 +00006738 /* check and handle 'errors' arg */
6739 if (errors==NULL || strcmp(errors, "strict")==0)
6740 flags = MB_ERR_INVALID_CHARS;
6741 else if (strcmp(errors, "ignore")==0)
6742 flags = 0;
6743 else {
6744 PyErr_Format(PyExc_ValueError,
6745 "mbcs encoding does not support errors='%s'",
6746 errors);
6747 return -1;
6748 }
6749
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006750 /* Skip trailing lead-byte unless 'final' is set */
6751 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006752 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006753
6754 /* First get the size of the result */
6755 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006756 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6757 if (usize==0)
6758 goto mbcs_decode_error;
6759 } else
6760 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006761
6762 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006763 /* Create unicode object */
6764 *v = _PyUnicode_New(usize);
6765 if (*v == NULL)
6766 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006767 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006768 }
6769 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006770 /* Extend unicode object */
6771 n = PyUnicode_GET_SIZE(*v);
Victor Stinner2fd82272011-10-03 04:06:05 +02006772 if (PyUnicode_Resize((PyObject**)v, n + usize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006773 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006774 }
6775
6776 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00006777 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006778 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006779 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6780 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00006781 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006782 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006783 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00006784
6785mbcs_decode_error:
6786 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6787 we raise a UnicodeDecodeError - else it is a 'generic'
6788 windows error
6789 */
6790 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6791 /* Ideally, we should get reason from FormatMessage - this
6792 is the Windows 2000 English version of the message
6793 */
6794 PyObject *exc = NULL;
6795 const char *reason = "No mapping for the Unicode character exists "
6796 "in the target multi-byte code page.";
6797 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6798 if (exc != NULL) {
6799 PyCodec_StrictErrors(exc);
6800 Py_DECREF(exc);
6801 }
6802 } else {
6803 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6804 }
6805 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006806}
6807
Alexander Belopolsky40018472011-02-26 01:02:56 +00006808PyObject *
6809PyUnicode_DecodeMBCSStateful(const char *s,
6810 Py_ssize_t size,
6811 const char *errors,
6812 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006813{
6814 PyUnicodeObject *v = NULL;
6815 int done;
6816
6817 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006818 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006819
6820#ifdef NEED_RETRY
6821 retry:
6822 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006823 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006824 else
6825#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006826 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006827
6828 if (done < 0) {
6829 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006830 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006831 }
6832
6833 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006834 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006835
6836#ifdef NEED_RETRY
6837 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006838 s += done;
6839 size -= done;
6840 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006841 }
6842#endif
Victor Stinner17efeed2011-10-04 20:05:46 +02006843#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006844 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006845 Py_DECREF(v);
6846 return NULL;
6847 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006848#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006849 assert(_PyUnicode_CheckConsistency(v, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006850 return (PyObject *)v;
6851}
6852
Alexander Belopolsky40018472011-02-26 01:02:56 +00006853PyObject *
6854PyUnicode_DecodeMBCS(const char *s,
6855 Py_ssize_t size,
6856 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006857{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006858 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6859}
6860
6861/*
6862 * Convert unicode into string object (MBCS).
6863 * Returns 0 if succeed, -1 otherwise.
6864 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006865static int
6866encode_mbcs(PyObject **repr,
6867 const Py_UNICODE *p, /* unicode */
6868 int size, /* size of unicode */
6869 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006870{
Victor Stinner554f3f02010-06-16 23:33:54 +00006871 BOOL usedDefaultChar = FALSE;
6872 BOOL *pusedDefaultChar;
6873 int mbcssize;
6874 Py_ssize_t n;
6875 PyObject *exc = NULL;
6876 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006877
6878 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006879
Victor Stinner554f3f02010-06-16 23:33:54 +00006880 /* check and handle 'errors' arg */
6881 if (errors==NULL || strcmp(errors, "strict")==0) {
6882 flags = WC_NO_BEST_FIT_CHARS;
6883 pusedDefaultChar = &usedDefaultChar;
6884 } else if (strcmp(errors, "replace")==0) {
6885 flags = 0;
6886 pusedDefaultChar = NULL;
6887 } else {
6888 PyErr_Format(PyExc_ValueError,
6889 "mbcs encoding does not support errors='%s'",
6890 errors);
6891 return -1;
6892 }
6893
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006894 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006895 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006896 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
6897 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00006898 if (mbcssize == 0) {
6899 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6900 return -1;
6901 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006902 /* If we used a default char, then we failed! */
6903 if (pusedDefaultChar && *pusedDefaultChar)
6904 goto mbcs_encode_error;
6905 } else {
6906 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006907 }
6908
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006909 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006910 /* Create string object */
6911 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
6912 if (*repr == NULL)
6913 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006914 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006915 }
6916 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006917 /* Extend string object */
6918 n = PyBytes_Size(*repr);
6919 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
6920 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006921 }
6922
6923 /* Do the conversion */
6924 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006925 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006926 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
6927 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006928 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6929 return -1;
6930 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006931 if (pusedDefaultChar && *pusedDefaultChar)
6932 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006933 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006934 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006935
6936mbcs_encode_error:
6937 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
6938 Py_XDECREF(exc);
6939 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006940}
6941
Alexander Belopolsky40018472011-02-26 01:02:56 +00006942PyObject *
6943PyUnicode_EncodeMBCS(const Py_UNICODE *p,
6944 Py_ssize_t size,
6945 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006946{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006947 PyObject *repr = NULL;
6948 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00006949
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006950#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00006951 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006952 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006953 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006954 else
6955#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006956 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006957
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006958 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006959 Py_XDECREF(repr);
6960 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006961 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006962
6963#ifdef NEED_RETRY
6964 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006965 p += INT_MAX;
6966 size -= INT_MAX;
6967 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006968 }
6969#endif
6970
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006971 return repr;
6972}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006973
Alexander Belopolsky40018472011-02-26 01:02:56 +00006974PyObject *
6975PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006976{
6977 if (!PyUnicode_Check(unicode)) {
6978 PyErr_BadArgument();
6979 return NULL;
6980 }
6981 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006982 PyUnicode_GET_SIZE(unicode),
6983 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006984}
6985
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006986#undef NEED_RETRY
6987
Victor Stinner99b95382011-07-04 14:23:54 +02006988#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006989
Guido van Rossumd57fd912000-03-10 22:53:23 +00006990/* --- Character Mapping Codec -------------------------------------------- */
6991
Alexander Belopolsky40018472011-02-26 01:02:56 +00006992PyObject *
6993PyUnicode_DecodeCharmap(const char *s,
6994 Py_ssize_t size,
6995 PyObject *mapping,
6996 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006997{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006998 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006999 Py_ssize_t startinpos;
7000 Py_ssize_t endinpos;
7001 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007002 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007003 PyUnicodeObject *v;
7004 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007005 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007006 PyObject *errorHandler = NULL;
7007 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007008 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007009 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00007010
Guido van Rossumd57fd912000-03-10 22:53:23 +00007011 /* Default to Latin-1 */
7012 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007013 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007014
7015 v = _PyUnicode_New(size);
7016 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007017 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007018 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007019 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007020 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007021 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007022 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007023 mapstring = PyUnicode_AS_UNICODE(mapping);
7024 maplen = PyUnicode_GET_SIZE(mapping);
7025 while (s < e) {
7026 unsigned char ch = *s;
7027 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007028
Benjamin Peterson29060642009-01-31 22:14:21 +00007029 if (ch < maplen)
7030 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00007031
Benjamin Peterson29060642009-01-31 22:14:21 +00007032 if (x == 0xfffe) {
7033 /* undefined mapping */
7034 outpos = p-PyUnicode_AS_UNICODE(v);
7035 startinpos = s-starts;
7036 endinpos = startinpos+1;
7037 if (unicode_decode_call_errorhandler(
7038 errors, &errorHandler,
7039 "charmap", "character maps to <undefined>",
7040 &starts, &e, &startinpos, &endinpos, &exc, &s,
7041 &v, &outpos, &p)) {
7042 goto onError;
7043 }
7044 continue;
7045 }
7046 *p++ = x;
7047 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007048 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007049 }
7050 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007051 while (s < e) {
7052 unsigned char ch = *s;
7053 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007054
Benjamin Peterson29060642009-01-31 22:14:21 +00007055 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7056 w = PyLong_FromLong((long)ch);
7057 if (w == NULL)
7058 goto onError;
7059 x = PyObject_GetItem(mapping, w);
7060 Py_DECREF(w);
7061 if (x == NULL) {
7062 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7063 /* No mapping found means: mapping is undefined. */
7064 PyErr_Clear();
7065 x = Py_None;
7066 Py_INCREF(x);
7067 } else
7068 goto onError;
7069 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007070
Benjamin Peterson29060642009-01-31 22:14:21 +00007071 /* Apply mapping */
7072 if (PyLong_Check(x)) {
7073 long value = PyLong_AS_LONG(x);
7074 if (value < 0 || value > 65535) {
7075 PyErr_SetString(PyExc_TypeError,
7076 "character mapping must be in range(65536)");
7077 Py_DECREF(x);
7078 goto onError;
7079 }
7080 *p++ = (Py_UNICODE)value;
7081 }
7082 else if (x == Py_None) {
7083 /* undefined mapping */
7084 outpos = p-PyUnicode_AS_UNICODE(v);
7085 startinpos = s-starts;
7086 endinpos = startinpos+1;
7087 if (unicode_decode_call_errorhandler(
7088 errors, &errorHandler,
7089 "charmap", "character maps to <undefined>",
7090 &starts, &e, &startinpos, &endinpos, &exc, &s,
7091 &v, &outpos, &p)) {
7092 Py_DECREF(x);
7093 goto onError;
7094 }
7095 Py_DECREF(x);
7096 continue;
7097 }
7098 else if (PyUnicode_Check(x)) {
7099 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007100
Benjamin Peterson29060642009-01-31 22:14:21 +00007101 if (targetsize == 1)
7102 /* 1-1 mapping */
7103 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007104
Benjamin Peterson29060642009-01-31 22:14:21 +00007105 else if (targetsize > 1) {
7106 /* 1-n mapping */
7107 if (targetsize > extrachars) {
7108 /* resize first */
7109 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
7110 Py_ssize_t needed = (targetsize - extrachars) + \
7111 (targetsize << 2);
7112 extrachars += needed;
7113 /* XXX overflow detection missing */
Victor Stinnerfe226c02011-10-03 03:52:20 +02007114 if (PyUnicode_Resize((PyObject**)&v,
Benjamin Peterson29060642009-01-31 22:14:21 +00007115 PyUnicode_GET_SIZE(v) + needed) < 0) {
7116 Py_DECREF(x);
7117 goto onError;
7118 }
7119 p = PyUnicode_AS_UNICODE(v) + oldpos;
7120 }
7121 Py_UNICODE_COPY(p,
7122 PyUnicode_AS_UNICODE(x),
7123 targetsize);
7124 p += targetsize;
7125 extrachars -= targetsize;
7126 }
7127 /* 1-0 mapping: skip the character */
7128 }
7129 else {
7130 /* wrong return value */
7131 PyErr_SetString(PyExc_TypeError,
7132 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007133 Py_DECREF(x);
7134 goto onError;
7135 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007136 Py_DECREF(x);
7137 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007138 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007139 }
7140 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinnerfe226c02011-10-03 03:52:20 +02007141 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007142 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007143 Py_XDECREF(errorHandler);
7144 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02007145#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02007146 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007147 Py_DECREF(v);
7148 return NULL;
7149 }
Victor Stinner17efeed2011-10-04 20:05:46 +02007150#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02007151 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007152 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00007153
Benjamin Peterson29060642009-01-31 22:14:21 +00007154 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007155 Py_XDECREF(errorHandler);
7156 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007157 Py_XDECREF(v);
7158 return NULL;
7159}
7160
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007161/* Charmap encoding: the lookup table */
7162
Alexander Belopolsky40018472011-02-26 01:02:56 +00007163struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007164 PyObject_HEAD
7165 unsigned char level1[32];
7166 int count2, count3;
7167 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007168};
7169
7170static PyObject*
7171encoding_map_size(PyObject *obj, PyObject* args)
7172{
7173 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007174 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007175 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007176}
7177
7178static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007179 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007180 PyDoc_STR("Return the size (in bytes) of this object") },
7181 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007182};
7183
7184static void
7185encoding_map_dealloc(PyObject* o)
7186{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007187 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007188}
7189
7190static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007191 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007192 "EncodingMap", /*tp_name*/
7193 sizeof(struct encoding_map), /*tp_basicsize*/
7194 0, /*tp_itemsize*/
7195 /* methods */
7196 encoding_map_dealloc, /*tp_dealloc*/
7197 0, /*tp_print*/
7198 0, /*tp_getattr*/
7199 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007200 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007201 0, /*tp_repr*/
7202 0, /*tp_as_number*/
7203 0, /*tp_as_sequence*/
7204 0, /*tp_as_mapping*/
7205 0, /*tp_hash*/
7206 0, /*tp_call*/
7207 0, /*tp_str*/
7208 0, /*tp_getattro*/
7209 0, /*tp_setattro*/
7210 0, /*tp_as_buffer*/
7211 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7212 0, /*tp_doc*/
7213 0, /*tp_traverse*/
7214 0, /*tp_clear*/
7215 0, /*tp_richcompare*/
7216 0, /*tp_weaklistoffset*/
7217 0, /*tp_iter*/
7218 0, /*tp_iternext*/
7219 encoding_map_methods, /*tp_methods*/
7220 0, /*tp_members*/
7221 0, /*tp_getset*/
7222 0, /*tp_base*/
7223 0, /*tp_dict*/
7224 0, /*tp_descr_get*/
7225 0, /*tp_descr_set*/
7226 0, /*tp_dictoffset*/
7227 0, /*tp_init*/
7228 0, /*tp_alloc*/
7229 0, /*tp_new*/
7230 0, /*tp_free*/
7231 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007232};
7233
7234PyObject*
7235PyUnicode_BuildEncodingMap(PyObject* string)
7236{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007237 PyObject *result;
7238 struct encoding_map *mresult;
7239 int i;
7240 int need_dict = 0;
7241 unsigned char level1[32];
7242 unsigned char level2[512];
7243 unsigned char *mlevel1, *mlevel2, *mlevel3;
7244 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007245 int kind;
7246 void *data;
7247 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007248
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007249 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007250 PyErr_BadArgument();
7251 return NULL;
7252 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007253 kind = PyUnicode_KIND(string);
7254 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007255 memset(level1, 0xFF, sizeof level1);
7256 memset(level2, 0xFF, sizeof level2);
7257
7258 /* If there isn't a one-to-one mapping of NULL to \0,
7259 or if there are non-BMP characters, we need to use
7260 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007261 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007262 need_dict = 1;
7263 for (i = 1; i < 256; i++) {
7264 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007265 ch = PyUnicode_READ(kind, data, i);
7266 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007267 need_dict = 1;
7268 break;
7269 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007270 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007271 /* unmapped character */
7272 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007273 l1 = ch >> 11;
7274 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007275 if (level1[l1] == 0xFF)
7276 level1[l1] = count2++;
7277 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007278 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007279 }
7280
7281 if (count2 >= 0xFF || count3 >= 0xFF)
7282 need_dict = 1;
7283
7284 if (need_dict) {
7285 PyObject *result = PyDict_New();
7286 PyObject *key, *value;
7287 if (!result)
7288 return NULL;
7289 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007290 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007291 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007292 if (!key || !value)
7293 goto failed1;
7294 if (PyDict_SetItem(result, key, value) == -1)
7295 goto failed1;
7296 Py_DECREF(key);
7297 Py_DECREF(value);
7298 }
7299 return result;
7300 failed1:
7301 Py_XDECREF(key);
7302 Py_XDECREF(value);
7303 Py_DECREF(result);
7304 return NULL;
7305 }
7306
7307 /* Create a three-level trie */
7308 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7309 16*count2 + 128*count3 - 1);
7310 if (!result)
7311 return PyErr_NoMemory();
7312 PyObject_Init(result, &EncodingMapType);
7313 mresult = (struct encoding_map*)result;
7314 mresult->count2 = count2;
7315 mresult->count3 = count3;
7316 mlevel1 = mresult->level1;
7317 mlevel2 = mresult->level23;
7318 mlevel3 = mresult->level23 + 16*count2;
7319 memcpy(mlevel1, level1, 32);
7320 memset(mlevel2, 0xFF, 16*count2);
7321 memset(mlevel3, 0, 128*count3);
7322 count3 = 0;
7323 for (i = 1; i < 256; i++) {
7324 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007325 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007326 /* unmapped character */
7327 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007328 o1 = PyUnicode_READ(kind, data, i)>>11;
7329 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007330 i2 = 16*mlevel1[o1] + o2;
7331 if (mlevel2[i2] == 0xFF)
7332 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007333 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007334 i3 = 128*mlevel2[i2] + o3;
7335 mlevel3[i3] = i;
7336 }
7337 return result;
7338}
7339
7340static int
7341encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
7342{
7343 struct encoding_map *map = (struct encoding_map*)mapping;
7344 int l1 = c>>11;
7345 int l2 = (c>>7) & 0xF;
7346 int l3 = c & 0x7F;
7347 int i;
7348
7349#ifdef Py_UNICODE_WIDE
7350 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007351 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007352 }
7353#endif
7354 if (c == 0)
7355 return 0;
7356 /* level 1*/
7357 i = map->level1[l1];
7358 if (i == 0xFF) {
7359 return -1;
7360 }
7361 /* level 2*/
7362 i = map->level23[16*i+l2];
7363 if (i == 0xFF) {
7364 return -1;
7365 }
7366 /* level 3 */
7367 i = map->level23[16*map->count2 + 128*i + l3];
7368 if (i == 0) {
7369 return -1;
7370 }
7371 return i;
7372}
7373
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007374/* Lookup the character ch in the mapping. If the character
7375 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007376 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007377static PyObject *
7378charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007379{
Christian Heimes217cfd12007-12-02 14:31:20 +00007380 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007381 PyObject *x;
7382
7383 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007384 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007385 x = PyObject_GetItem(mapping, w);
7386 Py_DECREF(w);
7387 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007388 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7389 /* No mapping found means: mapping is undefined. */
7390 PyErr_Clear();
7391 x = Py_None;
7392 Py_INCREF(x);
7393 return x;
7394 } else
7395 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007396 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007397 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007398 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007399 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007400 long value = PyLong_AS_LONG(x);
7401 if (value < 0 || value > 255) {
7402 PyErr_SetString(PyExc_TypeError,
7403 "character mapping must be in range(256)");
7404 Py_DECREF(x);
7405 return NULL;
7406 }
7407 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007408 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007409 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007410 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007411 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007412 /* wrong return value */
7413 PyErr_Format(PyExc_TypeError,
7414 "character mapping must return integer, bytes or None, not %.400s",
7415 x->ob_type->tp_name);
7416 Py_DECREF(x);
7417 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007418 }
7419}
7420
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007421static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007422charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007423{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007424 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7425 /* exponentially overallocate to minimize reallocations */
7426 if (requiredsize < 2*outsize)
7427 requiredsize = 2*outsize;
7428 if (_PyBytes_Resize(outobj, requiredsize))
7429 return -1;
7430 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007431}
7432
/* Outcome of trying to encode a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping has no encoding for the character */
    enc_EXCEPTION   /* an error occurred while encoding */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007436/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007437 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007438 space is available. Return a new reference to the object that
7439 was put in the output buffer, or Py_None, if the mapping was undefined
7440 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007441 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007442static charmapencode_result
7443charmapencode_output(Py_UNICODE c, PyObject *mapping,
7444 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007445{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007446 PyObject *rep;
7447 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007448 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007449
Christian Heimes90aa7642007-12-19 02:45:37 +00007450 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007451 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007452 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007453 if (res == -1)
7454 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007455 if (outsize<requiredsize)
7456 if (charmapencode_resize(outobj, outpos, requiredsize))
7457 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007458 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007459 outstart[(*outpos)++] = (char)res;
7460 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007461 }
7462
7463 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007464 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007465 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007466 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007467 Py_DECREF(rep);
7468 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007469 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007470 if (PyLong_Check(rep)) {
7471 Py_ssize_t requiredsize = *outpos+1;
7472 if (outsize<requiredsize)
7473 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7474 Py_DECREF(rep);
7475 return enc_EXCEPTION;
7476 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007477 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007478 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007479 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007480 else {
7481 const char *repchars = PyBytes_AS_STRING(rep);
7482 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7483 Py_ssize_t requiredsize = *outpos+repsize;
7484 if (outsize<requiredsize)
7485 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7486 Py_DECREF(rep);
7487 return enc_EXCEPTION;
7488 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007489 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007490 memcpy(outstart + *outpos, repchars, repsize);
7491 *outpos += repsize;
7492 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007493 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007494 Py_DECREF(rep);
7495 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007496}
7497
7498/* handle an error in PyUnicode_EncodeCharmap
7499 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007500static int
7501charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00007502 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007503 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007504 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007505 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007506{
7507 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007508 Py_ssize_t repsize;
7509 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007510 Py_UNICODE *uni2;
7511 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007512 Py_ssize_t collstartpos = *inpos;
7513 Py_ssize_t collendpos = *inpos+1;
7514 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007515 char *encoding = "charmap";
7516 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007517 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007518
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007519 /* find all unencodable characters */
7520 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007521 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007522 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007523 int res = encoding_map_lookup(p[collendpos], mapping);
7524 if (res != -1)
7525 break;
7526 ++collendpos;
7527 continue;
7528 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007529
Benjamin Peterson29060642009-01-31 22:14:21 +00007530 rep = charmapencode_lookup(p[collendpos], mapping);
7531 if (rep==NULL)
7532 return -1;
7533 else if (rep!=Py_None) {
7534 Py_DECREF(rep);
7535 break;
7536 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007537 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007538 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007539 }
7540 /* cache callback name lookup
7541 * (if not done yet, i.e. it's the first error) */
7542 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007543 if ((errors==NULL) || (!strcmp(errors, "strict")))
7544 *known_errorHandler = 1;
7545 else if (!strcmp(errors, "replace"))
7546 *known_errorHandler = 2;
7547 else if (!strcmp(errors, "ignore"))
7548 *known_errorHandler = 3;
7549 else if (!strcmp(errors, "xmlcharrefreplace"))
7550 *known_errorHandler = 4;
7551 else
7552 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007553 }
7554 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007555 case 1: /* strict */
7556 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7557 return -1;
7558 case 2: /* replace */
7559 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007560 x = charmapencode_output('?', mapping, res, respos);
7561 if (x==enc_EXCEPTION) {
7562 return -1;
7563 }
7564 else if (x==enc_FAILED) {
7565 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7566 return -1;
7567 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007568 }
7569 /* fall through */
7570 case 3: /* ignore */
7571 *inpos = collendpos;
7572 break;
7573 case 4: /* xmlcharrefreplace */
7574 /* generate replacement (temporarily (mis)uses p) */
7575 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007576 char buffer[2+29+1+1];
7577 char *cp;
7578 sprintf(buffer, "&#%d;", (int)p[collpos]);
7579 for (cp = buffer; *cp; ++cp) {
7580 x = charmapencode_output(*cp, mapping, res, respos);
7581 if (x==enc_EXCEPTION)
7582 return -1;
7583 else if (x==enc_FAILED) {
7584 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7585 return -1;
7586 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007587 }
7588 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007589 *inpos = collendpos;
7590 break;
7591 default:
7592 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00007593 encoding, reason, p, size, exceptionObject,
7594 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007595 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007596 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007597 if (PyBytes_Check(repunicode)) {
7598 /* Directly copy bytes result to output. */
7599 Py_ssize_t outsize = PyBytes_Size(*res);
7600 Py_ssize_t requiredsize;
7601 repsize = PyBytes_Size(repunicode);
7602 requiredsize = *respos + repsize;
7603 if (requiredsize > outsize)
7604 /* Make room for all additional bytes. */
7605 if (charmapencode_resize(res, respos, requiredsize)) {
7606 Py_DECREF(repunicode);
7607 return -1;
7608 }
7609 memcpy(PyBytes_AsString(*res) + *respos,
7610 PyBytes_AsString(repunicode), repsize);
7611 *respos += repsize;
7612 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007613 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007614 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007615 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007616 /* generate replacement */
7617 repsize = PyUnicode_GET_SIZE(repunicode);
7618 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007619 x = charmapencode_output(*uni2, mapping, res, respos);
7620 if (x==enc_EXCEPTION) {
7621 return -1;
7622 }
7623 else if (x==enc_FAILED) {
7624 Py_DECREF(repunicode);
7625 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7626 return -1;
7627 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007628 }
7629 *inpos = newpos;
7630 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007631 }
7632 return 0;
7633}
7634
Alexander Belopolsky40018472011-02-26 01:02:56 +00007635PyObject *
7636PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7637 Py_ssize_t size,
7638 PyObject *mapping,
7639 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007640{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007641 /* output object */
7642 PyObject *res = NULL;
7643 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007644 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007645 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007646 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007647 PyObject *errorHandler = NULL;
7648 PyObject *exc = NULL;
7649 /* the following variable is used for caching string comparisons
7650 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7651 * 3=ignore, 4=xmlcharrefreplace */
7652 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007653
7654 /* Default to Latin-1 */
7655 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007656 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007657
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007658 /* allocate enough for a simple encoding without
7659 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007660 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007661 if (res == NULL)
7662 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007663 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007664 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007665
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007666 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007667 /* try to encode it */
7668 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7669 if (x==enc_EXCEPTION) /* error */
7670 goto onError;
7671 if (x==enc_FAILED) { /* unencodable character */
7672 if (charmap_encoding_error(p, size, &inpos, mapping,
7673 &exc,
7674 &known_errorHandler, &errorHandler, errors,
7675 &res, &respos)) {
7676 goto onError;
7677 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007678 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007679 else
7680 /* done with this character => adjust input position */
7681 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007682 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007683
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007684 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007685 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007686 if (_PyBytes_Resize(&res, respos) < 0)
7687 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00007688
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007689 Py_XDECREF(exc);
7690 Py_XDECREF(errorHandler);
7691 return res;
7692
Benjamin Peterson29060642009-01-31 22:14:21 +00007693 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007694 Py_XDECREF(res);
7695 Py_XDECREF(exc);
7696 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007697 return NULL;
7698}
7699
Alexander Belopolsky40018472011-02-26 01:02:56 +00007700PyObject *
7701PyUnicode_AsCharmapString(PyObject *unicode,
7702 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007703{
7704 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007705 PyErr_BadArgument();
7706 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007707 }
7708 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007709 PyUnicode_GET_SIZE(unicode),
7710 mapping,
7711 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007712}
7713
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007714/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007715static void
7716make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007717 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007718 Py_ssize_t startpos, Py_ssize_t endpos,
7719 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007720{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007721 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007722 *exceptionObject = _PyUnicodeTranslateError_Create(
7723 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007724 }
7725 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007726 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7727 goto onError;
7728 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7729 goto onError;
7730 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7731 goto onError;
7732 return;
7733 onError:
7734 Py_DECREF(*exceptionObject);
7735 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007736 }
7737}
7738
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007739/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007740static void
7741raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007742 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007743 Py_ssize_t startpos, Py_ssize_t endpos,
7744 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007745{
7746 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007747 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007748 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007749 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007750}
7751
7752/* error handling callback helper:
7753 build arguments, call the callback and check the arguments,
7754 put the result into newpos and return the replacement string, which
7755 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007756static PyObject *
7757unicode_translate_call_errorhandler(const char *errors,
7758 PyObject **errorHandler,
7759 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007760 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007761 Py_ssize_t startpos, Py_ssize_t endpos,
7762 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007763{
Benjamin Peterson142957c2008-07-04 19:55:29 +00007764 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007765
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007766 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007767 PyObject *restuple;
7768 PyObject *resunicode;
7769
7770 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007771 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007772 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007773 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007774 }
7775
7776 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007777 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007778 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007779 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007780
7781 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00007782 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007783 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007784 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007785 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00007786 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007787 Py_DECREF(restuple);
7788 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007789 }
7790 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00007791 &resunicode, &i_newpos)) {
7792 Py_DECREF(restuple);
7793 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007794 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007795 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007796 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007797 else
7798 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007799 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007800 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7801 Py_DECREF(restuple);
7802 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007803 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007804 Py_INCREF(resunicode);
7805 Py_DECREF(restuple);
7806 return resunicode;
7807}
7808
7809/* Lookup the character ch in the mapping and put the result in result,
7810 which must be decrefed by the caller.
7811 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007812static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007813charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007814{
Christian Heimes217cfd12007-12-02 14:31:20 +00007815 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007816 PyObject *x;
7817
7818 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007819 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007820 x = PyObject_GetItem(mapping, w);
7821 Py_DECREF(w);
7822 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007823 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7824 /* No mapping found means: use 1:1 mapping. */
7825 PyErr_Clear();
7826 *result = NULL;
7827 return 0;
7828 } else
7829 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007830 }
7831 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007832 *result = x;
7833 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007834 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007835 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007836 long value = PyLong_AS_LONG(x);
7837 long max = PyUnicode_GetMax();
7838 if (value < 0 || value > max) {
7839 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00007840 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007841 Py_DECREF(x);
7842 return -1;
7843 }
7844 *result = x;
7845 return 0;
7846 }
7847 else if (PyUnicode_Check(x)) {
7848 *result = x;
7849 return 0;
7850 }
7851 else {
7852 /* wrong return value */
7853 PyErr_SetString(PyExc_TypeError,
7854 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007855 Py_DECREF(x);
7856 return -1;
7857 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007858}
7859/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00007860 if not reallocate and adjust various state variables.
7861 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007862static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007863charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00007864 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007865{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007866 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00007867 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007868 /* exponentially overallocate to minimize reallocations */
7869 if (requiredsize < 2 * oldsize)
7870 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007871 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
7872 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007873 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007874 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007875 }
7876 return 0;
7877}
7878/* lookup the character, put the result in the output string and adjust
7879 various state variables. Return a new reference to the object that
7880 was put in the output buffer in *result, or Py_None, if the mapping was
7881 undefined (in which case no character was written).
7882 The called must decref result.
7883 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007884static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007885charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
7886 PyObject *mapping, Py_UCS4 **output,
7887 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007888 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007889{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007890 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
7891 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00007892 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007893 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007894 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007895 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007896 }
7897 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007898 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00007899 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007900 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007901 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007902 }
7903 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007904 Py_ssize_t repsize;
7905 if (PyUnicode_READY(*res) == -1)
7906 return -1;
7907 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00007908 if (repsize==1) {
7909 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007910 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00007911 }
7912 else if (repsize!=0) {
7913 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007914 Py_ssize_t requiredsize = *opos +
7915 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00007916 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007917 Py_ssize_t i;
7918 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007919 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007920 for(i = 0; i < repsize; i++)
7921 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00007922 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007923 }
7924 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007925 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007926 return 0;
7927}
7928
Alexander Belopolsky40018472011-02-26 01:02:56 +00007929PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007930_PyUnicode_TranslateCharmap(PyObject *input,
7931 PyObject *mapping,
7932 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007933{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007934 /* input object */
7935 char *idata;
7936 Py_ssize_t size, i;
7937 int kind;
7938 /* output buffer */
7939 Py_UCS4 *output = NULL;
7940 Py_ssize_t osize;
7941 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007942 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007943 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007944 char *reason = "character maps to <undefined>";
7945 PyObject *errorHandler = NULL;
7946 PyObject *exc = NULL;
7947 /* the following variable is used for caching string comparisons
7948 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7949 * 3=ignore, 4=xmlcharrefreplace */
7950 int known_errorHandler = -1;
7951
Guido van Rossumd57fd912000-03-10 22:53:23 +00007952 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007953 PyErr_BadArgument();
7954 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007955 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007956
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007957 if (PyUnicode_READY(input) == -1)
7958 return NULL;
7959 idata = (char*)PyUnicode_DATA(input);
7960 kind = PyUnicode_KIND(input);
7961 size = PyUnicode_GET_LENGTH(input);
7962 i = 0;
7963
7964 if (size == 0) {
7965 Py_INCREF(input);
7966 return input;
7967 }
7968
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007969 /* allocate enough for a simple 1:1 translation without
7970 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007971 osize = size;
7972 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
7973 opos = 0;
7974 if (output == NULL) {
7975 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00007976 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007977 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007978
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007979 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007980 /* try to encode it */
7981 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007982 if (charmaptranslate_output(input, i, mapping,
7983 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007984 Py_XDECREF(x);
7985 goto onError;
7986 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007987 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00007988 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007989 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007990 else { /* untranslatable character */
7991 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7992 Py_ssize_t repsize;
7993 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007994 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00007995 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007996 Py_ssize_t collstart = i;
7997 Py_ssize_t collend = i+1;
7998 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007999
Benjamin Peterson29060642009-01-31 22:14:21 +00008000 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008001 while (collend < size) {
8002 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008003 goto onError;
8004 Py_XDECREF(x);
8005 if (x!=Py_None)
8006 break;
8007 ++collend;
8008 }
8009 /* cache callback name lookup
8010 * (if not done yet, i.e. it's the first error) */
8011 if (known_errorHandler==-1) {
8012 if ((errors==NULL) || (!strcmp(errors, "strict")))
8013 known_errorHandler = 1;
8014 else if (!strcmp(errors, "replace"))
8015 known_errorHandler = 2;
8016 else if (!strcmp(errors, "ignore"))
8017 known_errorHandler = 3;
8018 else if (!strcmp(errors, "xmlcharrefreplace"))
8019 known_errorHandler = 4;
8020 else
8021 known_errorHandler = 0;
8022 }
8023 switch (known_errorHandler) {
8024 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008025 raise_translate_exception(&exc, input, collstart,
8026 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008027 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008028 case 2: /* replace */
8029 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008030 for (coll = collstart; coll<collend; coll++)
8031 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008032 /* fall through */
8033 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008034 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008035 break;
8036 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008037 /* generate replacement (temporarily (mis)uses i) */
8038 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008039 char buffer[2+29+1+1];
8040 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008041 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8042 if (charmaptranslate_makespace(&output, &osize,
8043 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008044 goto onError;
8045 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008046 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008047 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008048 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008049 break;
8050 default:
8051 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008052 reason, input, &exc,
8053 collstart, collend, &newpos);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02008054 if (repunicode == NULL || _PyUnicode_READY_REPLACE(&repunicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00008055 goto onError;
8056 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008057 repsize = PyUnicode_GET_LENGTH(repunicode);
8058 if (charmaptranslate_makespace(&output, &osize,
8059 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008060 Py_DECREF(repunicode);
8061 goto onError;
8062 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008063 for (uni2 = 0; repsize-->0; ++uni2)
8064 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8065 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008066 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008067 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008068 }
8069 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008070 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8071 if (!res)
8072 goto onError;
8073 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008074 Py_XDECREF(exc);
8075 Py_XDECREF(errorHandler);
8076 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008077
Benjamin Peterson29060642009-01-31 22:14:21 +00008078 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008079 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008080 Py_XDECREF(exc);
8081 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008082 return NULL;
8083}
8084
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008085/* Deprecated. Use PyUnicode_Translate instead. */
8086PyObject *
8087PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8088 Py_ssize_t size,
8089 PyObject *mapping,
8090 const char *errors)
8091{
8092 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8093 if (!unicode)
8094 return NULL;
8095 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8096}
8097
Alexander Belopolsky40018472011-02-26 01:02:56 +00008098PyObject *
8099PyUnicode_Translate(PyObject *str,
8100 PyObject *mapping,
8101 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008102{
8103 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008104
Guido van Rossumd57fd912000-03-10 22:53:23 +00008105 str = PyUnicode_FromObject(str);
8106 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008107 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008108 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008109 Py_DECREF(str);
8110 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008111
Benjamin Peterson29060642009-01-31 22:14:21 +00008112 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008113 Py_XDECREF(str);
8114 return NULL;
8115}
Tim Petersced69f82003-09-16 20:30:58 +00008116
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008117static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008118fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008119{
8120 /* No need to call PyUnicode_READY(self) because this function is only
8121 called as a callback from fixup() which does it already. */
8122 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8123 const int kind = PyUnicode_KIND(self);
8124 void *data = PyUnicode_DATA(self);
8125 Py_UCS4 maxchar = 0, ch, fixed;
8126 Py_ssize_t i;
8127
8128 for (i = 0; i < len; ++i) {
8129 ch = PyUnicode_READ(kind, data, i);
8130 fixed = 0;
8131 if (ch > 127) {
8132 if (Py_UNICODE_ISSPACE(ch))
8133 fixed = ' ';
8134 else {
8135 const int decimal = Py_UNICODE_TODECIMAL(ch);
8136 if (decimal >= 0)
8137 fixed = '0' + decimal;
8138 }
8139 if (fixed != 0) {
8140 if (fixed > maxchar)
8141 maxchar = fixed;
8142 PyUnicode_WRITE(kind, data, i, fixed);
8143 }
8144 else if (ch > maxchar)
8145 maxchar = ch;
8146 }
8147 else if (ch > maxchar)
8148 maxchar = ch;
8149 }
8150
8151 return maxchar;
8152}
8153
8154PyObject *
8155_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8156{
8157 if (!PyUnicode_Check(unicode)) {
8158 PyErr_BadInternalCall();
8159 return NULL;
8160 }
8161 if (PyUnicode_READY(unicode) == -1)
8162 return NULL;
8163 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8164 /* If the string is already ASCII, just return the same string */
8165 Py_INCREF(unicode);
8166 return unicode;
8167 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008168 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008169}
8170
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008171PyObject *
8172PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8173 Py_ssize_t length)
8174{
8175 PyObject *result;
8176 Py_UNICODE *p; /* write pointer into result */
8177 Py_ssize_t i;
8178 /* Copy to a new string */
8179 result = (PyObject *)_PyUnicode_New(length);
8180 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
8181 if (result == NULL)
8182 return result;
8183 p = PyUnicode_AS_UNICODE(result);
8184 /* Iterate over code points */
8185 for (i = 0; i < length; i++) {
8186 Py_UNICODE ch =s[i];
8187 if (ch > 127) {
8188 int decimal = Py_UNICODE_TODECIMAL(ch);
8189 if (decimal >= 0)
8190 p[i] = '0' + decimal;
8191 }
8192 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008193#ifndef DONT_MAKE_RESULT_READY
8194 if (_PyUnicode_READY_REPLACE(&result)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008195 Py_DECREF(result);
8196 return NULL;
8197 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008198#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02008199 assert(_PyUnicode_CheckConsistency(result, 1));
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008200 return result;
8201}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008202/* --- Decimal Encoder ---------------------------------------------------- */
8203
Alexander Belopolsky40018472011-02-26 01:02:56 +00008204int
8205PyUnicode_EncodeDecimal(Py_UNICODE *s,
8206 Py_ssize_t length,
8207 char *output,
8208 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008209{
8210 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008211 PyObject *errorHandler = NULL;
8212 PyObject *exc = NULL;
8213 const char *encoding = "decimal";
8214 const char *reason = "invalid decimal Unicode string";
8215 /* the following variable is used for caching string comparisons
8216 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
8217 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008218
8219 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008220 PyErr_BadArgument();
8221 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008222 }
8223
8224 p = s;
8225 end = s + length;
8226 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008227 register Py_UNICODE ch = *p;
8228 int decimal;
8229 PyObject *repunicode;
8230 Py_ssize_t repsize;
8231 Py_ssize_t newpos;
8232 Py_UNICODE *uni2;
8233 Py_UNICODE *collstart;
8234 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00008235
Benjamin Peterson29060642009-01-31 22:14:21 +00008236 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008237 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00008238 ++p;
8239 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008240 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008241 decimal = Py_UNICODE_TODECIMAL(ch);
8242 if (decimal >= 0) {
8243 *output++ = '0' + decimal;
8244 ++p;
8245 continue;
8246 }
8247 if (0 < ch && ch < 256) {
8248 *output++ = (char)ch;
8249 ++p;
8250 continue;
8251 }
8252 /* All other characters are considered unencodable */
8253 collstart = p;
8254 collend = p+1;
8255 while (collend < end) {
8256 if ((0 < *collend && *collend < 256) ||
8257 !Py_UNICODE_ISSPACE(*collend) ||
8258 Py_UNICODE_TODECIMAL(*collend))
8259 break;
8260 }
8261 /* cache callback name lookup
8262 * (if not done yet, i.e. it's the first error) */
8263 if (known_errorHandler==-1) {
8264 if ((errors==NULL) || (!strcmp(errors, "strict")))
8265 known_errorHandler = 1;
8266 else if (!strcmp(errors, "replace"))
8267 known_errorHandler = 2;
8268 else if (!strcmp(errors, "ignore"))
8269 known_errorHandler = 3;
8270 else if (!strcmp(errors, "xmlcharrefreplace"))
8271 known_errorHandler = 4;
8272 else
8273 known_errorHandler = 0;
8274 }
8275 switch (known_errorHandler) {
8276 case 1: /* strict */
8277 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
8278 goto onError;
8279 case 2: /* replace */
8280 for (p = collstart; p < collend; ++p)
8281 *output++ = '?';
8282 /* fall through */
8283 case 3: /* ignore */
8284 p = collend;
8285 break;
8286 case 4: /* xmlcharrefreplace */
8287 /* generate replacement (temporarily (mis)uses p) */
8288 for (p = collstart; p < collend; ++p)
8289 output += sprintf(output, "&#%d;", (int)*p);
8290 p = collend;
8291 break;
8292 default:
8293 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
8294 encoding, reason, s, length, &exc,
8295 collstart-s, collend-s, &newpos);
8296 if (repunicode == NULL)
8297 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008298 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00008299 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008300 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8301 Py_DECREF(repunicode);
8302 goto onError;
8303 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008304 /* generate replacement */
8305 repsize = PyUnicode_GET_SIZE(repunicode);
8306 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
8307 Py_UNICODE ch = *uni2;
8308 if (Py_UNICODE_ISSPACE(ch))
8309 *output++ = ' ';
8310 else {
8311 decimal = Py_UNICODE_TODECIMAL(ch);
8312 if (decimal >= 0)
8313 *output++ = '0' + decimal;
8314 else if (0 < ch && ch < 256)
8315 *output++ = (char)ch;
8316 else {
8317 Py_DECREF(repunicode);
8318 raise_encode_exception(&exc, encoding,
8319 s, length, collstart-s, collend-s, reason);
8320 goto onError;
8321 }
8322 }
8323 }
8324 p = s + newpos;
8325 Py_DECREF(repunicode);
8326 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00008327 }
8328 /* 0-terminate the output string */
8329 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008330 Py_XDECREF(exc);
8331 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008332 return 0;
8333
Benjamin Peterson29060642009-01-31 22:14:21 +00008334 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008335 Py_XDECREF(exc);
8336 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008337 return -1;
8338}
8339
Guido van Rossumd57fd912000-03-10 22:53:23 +00008340/* --- Helpers ------------------------------------------------------------ */
8341
Victor Stinnerc3cec782011-10-05 21:24:08 +02008342#include "stringlib/asciilib.h"
8343#include "stringlib/fastsearch.h"
8344#include "stringlib/partition.h"
8345#include "stringlib/split.h"
8346#include "stringlib/count.h"
8347#include "stringlib/find.h"
8348#include "stringlib/localeutil.h"
8349#include "stringlib/undef.h"
8350
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008351#include "stringlib/ucs1lib.h"
8352#include "stringlib/fastsearch.h"
8353#include "stringlib/partition.h"
8354#include "stringlib/split.h"
8355#include "stringlib/count.h"
8356#include "stringlib/find.h"
8357#include "stringlib/localeutil.h"
8358#include "stringlib/undef.h"
8359
8360#include "stringlib/ucs2lib.h"
8361#include "stringlib/fastsearch.h"
8362#include "stringlib/partition.h"
8363#include "stringlib/split.h"
8364#include "stringlib/count.h"
8365#include "stringlib/find.h"
8366#include "stringlib/localeutil.h"
8367#include "stringlib/undef.h"
8368
8369#include "stringlib/ucs4lib.h"
8370#include "stringlib/fastsearch.h"
8371#include "stringlib/partition.h"
8372#include "stringlib/split.h"
8373#include "stringlib/count.h"
8374#include "stringlib/find.h"
8375#include "stringlib/localeutil.h"
8376#include "stringlib/undef.h"
8377
8378static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02008379any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ascii)(const Py_UCS1*, Py_ssize_t,
8380 const Py_UCS1*, Py_ssize_t,
8381 Py_ssize_t, Py_ssize_t),
8382 Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008383 const Py_UCS1*, Py_ssize_t,
8384 Py_ssize_t, Py_ssize_t),
8385 Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
8386 const Py_UCS2*, Py_ssize_t,
8387 Py_ssize_t, Py_ssize_t),
8388 Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t,
8389 const Py_UCS4*, Py_ssize_t,
8390 Py_ssize_t, Py_ssize_t),
8391 PyObject* s1, PyObject* s2,
8392 Py_ssize_t start,
8393 Py_ssize_t end)
8394{
8395 int kind1, kind2, kind;
8396 void *buf1, *buf2;
8397 Py_ssize_t len1, len2, result;
8398
8399 kind1 = PyUnicode_KIND(s1);
8400 kind2 = PyUnicode_KIND(s2);
8401 kind = kind1 > kind2 ? kind1 : kind2;
8402 buf1 = PyUnicode_DATA(s1);
8403 buf2 = PyUnicode_DATA(s2);
8404 if (kind1 != kind)
8405 buf1 = _PyUnicode_AsKind(s1, kind);
8406 if (!buf1)
8407 return -2;
8408 if (kind2 != kind)
8409 buf2 = _PyUnicode_AsKind(s2, kind);
8410 if (!buf2) {
8411 if (kind1 != kind) PyMem_Free(buf1);
8412 return -2;
8413 }
8414 len1 = PyUnicode_GET_LENGTH(s1);
8415 len2 = PyUnicode_GET_LENGTH(s2);
8416
8417 switch(kind) {
8418 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008419 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8420 result = ascii(buf1, len1, buf2, len2, start, end);
8421 else
8422 result = ucs1(buf1, len1, buf2, len2, start, end);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008423 break;
8424 case PyUnicode_2BYTE_KIND:
8425 result = ucs2(buf1, len1, buf2, len2, start, end);
8426 break;
8427 case PyUnicode_4BYTE_KIND:
8428 result = ucs4(buf1, len1, buf2, len2, start, end);
8429 break;
8430 default:
8431 assert(0); result = -2;
8432 }
8433
8434 if (kind1 != kind)
8435 PyMem_Free(buf1);
8436 if (kind2 != kind)
8437 PyMem_Free(buf2);
8438
8439 return result;
8440}
8441
8442Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02008443_PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008444 Py_ssize_t n_buffer,
8445 void *digits, Py_ssize_t n_digits,
8446 Py_ssize_t min_width,
8447 const char *grouping,
8448 const char *thousands_sep)
8449{
8450 switch(kind) {
8451 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008452 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
8453 return _PyUnicode_ascii_InsertThousandsGrouping(
8454 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8455 min_width, grouping, thousands_sep);
8456 else
8457 return _PyUnicode_ucs1_InsertThousandsGrouping(
8458 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8459 min_width, grouping, thousands_sep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008460 case PyUnicode_2BYTE_KIND:
8461 return _PyUnicode_ucs2_InsertThousandsGrouping(
8462 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
8463 min_width, grouping, thousands_sep);
8464 case PyUnicode_4BYTE_KIND:
8465 return _PyUnicode_ucs4_InsertThousandsGrouping(
8466 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
8467 min_width, grouping, thousands_sep);
8468 }
8469 assert(0);
8470 return -1;
8471}
8472
8473
Eric Smith8c663262007-08-25 02:26:07 +00008474#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00008475#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008476
Thomas Wouters477c8d52006-05-27 19:21:47 +00008477#include "stringlib/count.h"
8478#include "stringlib/find.h"
Eric Smith5807c412008-05-11 21:00:57 +00008479
/* helper macro to fixup start/end slice values

   Modifies `start` and `end` in place (both must be assignable):
     - an `end` greater than `len` is lowered to `len`;
     - a negative `end` has `len` added and is then floored at 0;
     - a negative `start` has `len` added and is then floored at 0.
   A positive `start` is left as it is, even when it exceeds `len`.

   The expansion is a pair of bare `if` statements, not a single statement,
   and each argument is evaluated more than once -- pass side-effect-free
   expressions and do not use it as the unbraced body of an if/else.
   NOTE(review): an included stringlib header may carry an identical
   definition; if so the token sequence here must stay in sync with it --
   confirm before changing the expansion. */
#define ADJUST_INDICES(start, end, len)         \
    if (end > len)                              \
        end = len;                              \
    else if (end < 0) {                         \
        end += len;                             \
        if (end < 0)                            \
            end = 0;                            \
    }                                           \
    if (start < 0) {                            \
        start += len;                           \
        if (start < 0)                          \
            start = 0;                          \
    }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008494
Alexander Belopolsky40018472011-02-26 01:02:56 +00008495Py_ssize_t
8496PyUnicode_Count(PyObject *str,
8497 PyObject *substr,
8498 Py_ssize_t start,
8499 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008500{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008501 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008502 PyUnicodeObject* str_obj;
8503 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008504 int kind1, kind2, kind;
8505 void *buf1 = NULL, *buf2 = NULL;
8506 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008507
Thomas Wouters477c8d52006-05-27 19:21:47 +00008508 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008509 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008510 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008511 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02008512 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008513 Py_DECREF(str_obj);
8514 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008515 }
Tim Petersced69f82003-09-16 20:30:58 +00008516
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008517 kind1 = PyUnicode_KIND(str_obj);
8518 kind2 = PyUnicode_KIND(sub_obj);
8519 kind = kind1 > kind2 ? kind1 : kind2;
8520 buf1 = PyUnicode_DATA(str_obj);
8521 if (kind1 != kind)
8522 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
8523 if (!buf1)
8524 goto onError;
8525 buf2 = PyUnicode_DATA(sub_obj);
8526 if (kind2 != kind)
8527 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
8528 if (!buf2)
8529 goto onError;
8530 len1 = PyUnicode_GET_LENGTH(str_obj);
8531 len2 = PyUnicode_GET_LENGTH(sub_obj);
8532
8533 ADJUST_INDICES(start, end, len1);
8534 switch(kind) {
8535 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008536 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
8537 result = asciilib_count(
8538 ((Py_UCS1*)buf1) + start, end - start,
8539 buf2, len2, PY_SSIZE_T_MAX
8540 );
8541 else
8542 result = ucs1lib_count(
8543 ((Py_UCS1*)buf1) + start, end - start,
8544 buf2, len2, PY_SSIZE_T_MAX
8545 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008546 break;
8547 case PyUnicode_2BYTE_KIND:
8548 result = ucs2lib_count(
8549 ((Py_UCS2*)buf1) + start, end - start,
8550 buf2, len2, PY_SSIZE_T_MAX
8551 );
8552 break;
8553 case PyUnicode_4BYTE_KIND:
8554 result = ucs4lib_count(
8555 ((Py_UCS4*)buf1) + start, end - start,
8556 buf2, len2, PY_SSIZE_T_MAX
8557 );
8558 break;
8559 default:
8560 assert(0); result = 0;
8561 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008562
8563 Py_DECREF(sub_obj);
8564 Py_DECREF(str_obj);
8565
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008566 if (kind1 != kind)
8567 PyMem_Free(buf1);
8568 if (kind2 != kind)
8569 PyMem_Free(buf2);
8570
Guido van Rossumd57fd912000-03-10 22:53:23 +00008571 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008572 onError:
8573 Py_DECREF(sub_obj);
8574 Py_DECREF(str_obj);
8575 if (kind1 != kind && buf1)
8576 PyMem_Free(buf1);
8577 if (kind2 != kind && buf2)
8578 PyMem_Free(buf2);
8579 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008580}
8581
Alexander Belopolsky40018472011-02-26 01:02:56 +00008582Py_ssize_t
8583PyUnicode_Find(PyObject *str,
8584 PyObject *sub,
8585 Py_ssize_t start,
8586 Py_ssize_t end,
8587 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008588{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008589 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008590
Guido van Rossumd57fd912000-03-10 22:53:23 +00008591 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008592 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008593 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008594 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008595 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008596 Py_DECREF(str);
8597 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008598 }
Tim Petersced69f82003-09-16 20:30:58 +00008599
Thomas Wouters477c8d52006-05-27 19:21:47 +00008600 if (direction > 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008601 result = any_find_slice(
Victor Stinnerc3cec782011-10-05 21:24:08 +02008602 asciilib_find_slice, ucs1lib_find_slice,
8603 ucs2lib_find_slice, ucs4lib_find_slice,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008604 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008605 );
8606 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008607 result = any_find_slice(
Victor Stinnerc3cec782011-10-05 21:24:08 +02008608 asciilib_find_slice, ucs1lib_rfind_slice,
8609 ucs2lib_rfind_slice, ucs4lib_rfind_slice,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008610 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008611 );
8612
Guido van Rossumd57fd912000-03-10 22:53:23 +00008613 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008614 Py_DECREF(sub);
8615
Guido van Rossumd57fd912000-03-10 22:53:23 +00008616 return result;
8617}
8618
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008619Py_ssize_t
8620PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8621 Py_ssize_t start, Py_ssize_t end,
8622 int direction)
8623{
8624 char *result;
8625 int kind;
8626 if (PyUnicode_READY(str) == -1)
8627 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02008628 if (start < 0 || end < 0) {
8629 PyErr_SetString(PyExc_IndexError, "string index out of range");
8630 return -2;
8631 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008632 if (end > PyUnicode_GET_LENGTH(str))
8633 end = PyUnicode_GET_LENGTH(str);
8634 kind = PyUnicode_KIND(str);
8635 result = findchar(PyUnicode_1BYTE_DATA(str)
8636 + PyUnicode_KIND_SIZE(kind, start),
8637 kind,
8638 end-start, ch, direction);
8639 if (!result)
8640 return -1;
8641 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8642}
8643
Alexander Belopolsky40018472011-02-26 01:02:56 +00008644static int
8645tailmatch(PyUnicodeObject *self,
8646 PyUnicodeObject *substring,
8647 Py_ssize_t start,
8648 Py_ssize_t end,
8649 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008650{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008651 int kind_self;
8652 int kind_sub;
8653 void *data_self;
8654 void *data_sub;
8655 Py_ssize_t offset;
8656 Py_ssize_t i;
8657 Py_ssize_t end_sub;
8658
8659 if (PyUnicode_READY(self) == -1 ||
8660 PyUnicode_READY(substring) == -1)
8661 return 0;
8662
8663 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008664 return 1;
8665
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008666 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8667 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008668 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008669 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008670
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008671 kind_self = PyUnicode_KIND(self);
8672 data_self = PyUnicode_DATA(self);
8673 kind_sub = PyUnicode_KIND(substring);
8674 data_sub = PyUnicode_DATA(substring);
8675 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8676
8677 if (direction > 0)
8678 offset = end;
8679 else
8680 offset = start;
8681
8682 if (PyUnicode_READ(kind_self, data_self, offset) ==
8683 PyUnicode_READ(kind_sub, data_sub, 0) &&
8684 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8685 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8686 /* If both are of the same kind, memcmp is sufficient */
8687 if (kind_self == kind_sub) {
8688 return ! memcmp((char *)data_self +
8689 (offset * PyUnicode_CHARACTER_SIZE(substring)),
8690 data_sub,
8691 PyUnicode_GET_LENGTH(substring) *
8692 PyUnicode_CHARACTER_SIZE(substring));
8693 }
8694 /* otherwise we have to compare each character by first accesing it */
8695 else {
8696 /* We do not need to compare 0 and len(substring)-1 because
8697 the if statement above ensured already that they are equal
8698 when we end up here. */
8699 // TODO: honor direction and do a forward or backwards search
8700 for (i = 1; i < end_sub; ++i) {
8701 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8702 PyUnicode_READ(kind_sub, data_sub, i))
8703 return 0;
8704 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008705 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008706 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008707 }
8708
8709 return 0;
8710}
8711
Alexander Belopolsky40018472011-02-26 01:02:56 +00008712Py_ssize_t
8713PyUnicode_Tailmatch(PyObject *str,
8714 PyObject *substr,
8715 Py_ssize_t start,
8716 Py_ssize_t end,
8717 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008718{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008719 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008720
Guido van Rossumd57fd912000-03-10 22:53:23 +00008721 str = PyUnicode_FromObject(str);
8722 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008723 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008724 substr = PyUnicode_FromObject(substr);
8725 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008726 Py_DECREF(str);
8727 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008728 }
Tim Petersced69f82003-09-16 20:30:58 +00008729
Guido van Rossumd57fd912000-03-10 22:53:23 +00008730 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00008731 (PyUnicodeObject *)substr,
8732 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008733 Py_DECREF(str);
8734 Py_DECREF(substr);
8735 return result;
8736}
8737
Guido van Rossumd57fd912000-03-10 22:53:23 +00008738/* Apply fixfct filter to the Unicode object self and return a
8739 reference to the modified object */
8740
Alexander Belopolsky40018472011-02-26 01:02:56 +00008741static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02008742fixup(PyObject *self,
8743 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008744{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008745 PyObject *u;
8746 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008747
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008748 if (PyUnicode_READY(self) == -1)
8749 return NULL;
8750 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8751 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8752 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008753 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008754 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008755
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008756 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
8757 PyUnicode_GET_LENGTH(u) * PyUnicode_CHARACTER_SIZE(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008758
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008759 /* fix functions return the new maximum character in a string,
8760 if the kind of the resulting unicode object does not change,
8761 everything is fine. Otherwise we need to change the string kind
8762 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02008763 maxchar_new = fixfct(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008764 if (maxchar_new == 0)
8765 /* do nothing, keep maxchar_new at 0 which means no changes. */;
8766 else if (maxchar_new <= 127)
8767 maxchar_new = 127;
8768 else if (maxchar_new <= 255)
8769 maxchar_new = 255;
8770 else if (maxchar_new <= 65535)
8771 maxchar_new = 65535;
8772 else
8773 maxchar_new = 1114111; /* 0x10ffff */
8774
8775 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008776 /* fixfct should return TRUE if it modified the buffer. If
8777 FALSE, return a reference to the original buffer instead
8778 (to save space, not time) */
8779 Py_INCREF(self);
8780 Py_DECREF(u);
8781 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008782 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008783 else if (maxchar_new == maxchar_old) {
8784 return u;
8785 }
8786 else {
8787 /* In case the maximum character changed, we need to
8788 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008789 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008790 if (v == NULL) {
8791 Py_DECREF(u);
8792 return NULL;
8793 }
8794 if (maxchar_new > maxchar_old) {
8795 /* If the maxchar increased so that the kind changed, not all
8796 characters are representable anymore and we need to fix the
8797 string again. This only happens in very few cases. */
Victor Stinner157f83f2011-09-28 21:41:31 +02008798 if (PyUnicode_CopyCharacters(v, 0,
8799 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008800 PyUnicode_GET_LENGTH(self)) < 0)
8801 {
8802 Py_DECREF(u);
8803 return NULL;
8804 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008805 maxchar_old = fixfct(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008806 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8807 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008808 else {
Victor Stinner157f83f2011-09-28 21:41:31 +02008809 if (PyUnicode_CopyCharacters(v, 0,
8810 u, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008811 PyUnicode_GET_LENGTH(self)) < 0)
8812 {
8813 Py_DECREF(u);
8814 return NULL;
8815 }
8816 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008817
8818 Py_DECREF(u);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02008819 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008820 return v;
8821 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008822}
8823
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008824static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008825fixupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008826{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008827 /* No need to call PyUnicode_READY(self) because this function is only
8828 called as a callback from fixup() which does it already. */
8829 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8830 const int kind = PyUnicode_KIND(self);
8831 void *data = PyUnicode_DATA(self);
8832 int touched = 0;
8833 Py_UCS4 maxchar = 0;
8834 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008835
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008836 for (i = 0; i < len; ++i) {
8837 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8838 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8839 if (up != ch) {
8840 if (up > maxchar)
8841 maxchar = up;
8842 PyUnicode_WRITE(kind, data, i, up);
8843 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008844 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008845 else if (ch > maxchar)
8846 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008847 }
8848
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008849 if (touched)
8850 return maxchar;
8851 else
8852 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008853}
8854
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008855static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008856fixlower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008857{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008858 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8859 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8860 const int kind = PyUnicode_KIND(self);
8861 void *data = PyUnicode_DATA(self);
8862 int touched = 0;
8863 Py_UCS4 maxchar = 0;
8864 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008865
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008866 for(i = 0; i < len; ++i) {
8867 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8868 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8869 if (lo != ch) {
8870 if (lo > maxchar)
8871 maxchar = lo;
8872 PyUnicode_WRITE(kind, data, i, lo);
8873 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008874 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008875 else if (ch > maxchar)
8876 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008877 }
8878
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008879 if (touched)
8880 return maxchar;
8881 else
8882 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008883}
8884
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008885static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008886fixswapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008887{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008888 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8889 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8890 const int kind = PyUnicode_KIND(self);
8891 void *data = PyUnicode_DATA(self);
8892 int touched = 0;
8893 Py_UCS4 maxchar = 0;
8894 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008895
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008896 for(i = 0; i < len; ++i) {
8897 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8898 Py_UCS4 nu = 0;
8899
8900 if (Py_UNICODE_ISUPPER(ch))
8901 nu = Py_UNICODE_TOLOWER(ch);
8902 else if (Py_UNICODE_ISLOWER(ch))
8903 nu = Py_UNICODE_TOUPPER(ch);
8904
8905 if (nu != 0) {
8906 if (nu > maxchar)
8907 maxchar = nu;
8908 PyUnicode_WRITE(kind, data, i, nu);
8909 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008910 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008911 else if (ch > maxchar)
8912 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008913 }
8914
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008915 if (touched)
8916 return maxchar;
8917 else
8918 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008919}
8920
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008921static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008922fixcapitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008923{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008924 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8925 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8926 const int kind = PyUnicode_KIND(self);
8927 void *data = PyUnicode_DATA(self);
8928 int touched = 0;
8929 Py_UCS4 maxchar = 0;
8930 Py_ssize_t i = 0;
8931 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00008932
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008933 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008934 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008935
8936 ch = PyUnicode_READ(kind, data, i);
8937 if (!Py_UNICODE_ISUPPER(ch)) {
8938 maxchar = Py_UNICODE_TOUPPER(ch);
8939 PyUnicode_WRITE(kind, data, i, maxchar);
8940 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008941 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008942 ++i;
8943 for(; i < len; ++i) {
8944 ch = PyUnicode_READ(kind, data, i);
8945 if (!Py_UNICODE_ISLOWER(ch)) {
8946 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8947 if (lo > maxchar)
8948 maxchar = lo;
8949 PyUnicode_WRITE(kind, data, i, lo);
8950 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008951 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008952 else if (ch > maxchar)
8953 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008954 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008955
8956 if (touched)
8957 return maxchar;
8958 else
8959 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008960}
8961
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008962static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008963fixtitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008964{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008965 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8966 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8967 const int kind = PyUnicode_KIND(self);
8968 void *data = PyUnicode_DATA(self);
8969 Py_UCS4 maxchar = 0;
8970 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008971 int previous_is_cased;
8972
8973 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008974 if (len == 1) {
8975 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8976 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
8977 if (ti != ch) {
8978 PyUnicode_WRITE(kind, data, i, ti);
8979 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00008980 }
8981 else
8982 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008983 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008984 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008985 for(; i < len; ++i) {
8986 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8987 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00008988
Benjamin Peterson29060642009-01-31 22:14:21 +00008989 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008990 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00008991 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008992 nu = Py_UNICODE_TOTITLE(ch);
8993
8994 if (nu > maxchar)
8995 maxchar = nu;
8996 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00008997
Benjamin Peterson29060642009-01-31 22:14:21 +00008998 if (Py_UNICODE_ISLOWER(ch) ||
8999 Py_UNICODE_ISUPPER(ch) ||
9000 Py_UNICODE_ISTITLE(ch))
9001 previous_is_cased = 1;
9002 else
9003 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009004 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009005 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009006}
9007
Tim Peters8ce9f162004-08-27 01:49:32 +00009008PyObject *
9009PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009010{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009011 PyObject *sep = NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009012 Py_ssize_t seplen = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009013 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009014 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009015 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9016 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009017 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009018 Py_ssize_t sz, i, res_offset;
9019 Py_UCS4 maxchar = 0;
9020 Py_UCS4 item_maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009021
Tim Peters05eba1f2004-08-27 21:32:02 +00009022 fseq = PySequence_Fast(seq, "");
9023 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009024 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009025 }
9026
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009027 /* NOTE: the following code can't call back into Python code,
9028 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009029 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009030
Tim Peters05eba1f2004-08-27 21:32:02 +00009031 seqlen = PySequence_Fast_GET_SIZE(fseq);
9032 /* If empty sequence, return u"". */
9033 if (seqlen == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009034 res = PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009035 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00009036 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009037 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00009038 /* If singleton sequence with an exact Unicode, return that. */
9039 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009040 item = items[0];
9041 if (PyUnicode_CheckExact(item)) {
9042 Py_INCREF(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009043 res = item;
Benjamin Peterson29060642009-01-31 22:14:21 +00009044 goto Done;
9045 }
Tim Peters8ce9f162004-08-27 01:49:32 +00009046 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009047 else {
9048 /* Set up sep and seplen */
9049 if (separator == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009050 /* fall back to a blank space separator */
9051 sep = PyUnicode_FromOrdinal(' ');
Victor Stinnere9a29352011-10-01 02:14:59 +02009052 if (!sep)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009053 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00009054 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009055 else {
9056 if (!PyUnicode_Check(separator)) {
9057 PyErr_Format(PyExc_TypeError,
9058 "separator: expected str instance,"
9059 " %.80s found",
9060 Py_TYPE(separator)->tp_name);
9061 goto onError;
9062 }
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02009063 if (PyUnicode_READY(separator))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009064 goto onError;
9065 sep = separator;
9066 seplen = PyUnicode_GET_LENGTH(separator);
9067 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
Georg Brandl7597add2011-10-05 16:36:47 +02009068 /* inc refcount to keep this code path symmetric with the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009069 above case of a blank separator */
9070 Py_INCREF(sep);
Tim Peters05eba1f2004-08-27 21:32:02 +00009071 }
9072 }
9073
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009074 /* There are at least two things to join, or else we have a subclass
9075 * of str in the sequence.
9076 * Do a pre-pass to figure out the total amount of space we'll
9077 * need (sz), and see whether all argument are strings.
9078 */
9079 sz = 0;
9080 for (i = 0; i < seqlen; i++) {
9081 const Py_ssize_t old_sz = sz;
9082 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009083 if (!PyUnicode_Check(item)) {
9084 PyErr_Format(PyExc_TypeError,
9085 "sequence item %zd: expected str instance,"
9086 " %.80s found",
9087 i, Py_TYPE(item)->tp_name);
9088 goto onError;
9089 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009090 if (PyUnicode_READY(item) == -1)
9091 goto onError;
9092 sz += PyUnicode_GET_LENGTH(item);
9093 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
9094 if (item_maxchar > maxchar)
9095 maxchar = item_maxchar;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009096 if (i != 0)
9097 sz += seplen;
9098 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9099 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009100 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009101 goto onError;
9102 }
9103 }
Tim Petersced69f82003-09-16 20:30:58 +00009104
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009105 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009106 if (res == NULL)
9107 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009108
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009109 /* Catenate everything. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009110 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinner9ce5a832011-10-03 23:36:02 +02009111 Py_ssize_t itemlen, copied;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009112 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009113 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009114 if (i && seplen != 0) {
9115 copied = PyUnicode_CopyCharacters(res, res_offset,
9116 sep, 0, seplen);
9117 if (copied < 0)
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009118 goto onError;
Victor Stinner9ce5a832011-10-03 23:36:02 +02009119#ifdef Py_DEBUG
9120 res_offset += copied;
9121#else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009122 res_offset += seplen;
Victor Stinner9ce5a832011-10-03 23:36:02 +02009123#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00009124 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009125 itemlen = PyUnicode_GET_LENGTH(item);
9126 if (itemlen != 0) {
9127 copied = PyUnicode_CopyCharacters(res, res_offset,
9128 item, 0, itemlen);
9129 if (copied < 0)
9130 goto onError;
9131#ifdef Py_DEBUG
9132 res_offset += copied;
9133#else
9134 res_offset += itemlen;
9135#endif
9136 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009137 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009138 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009139
Benjamin Peterson29060642009-01-31 22:14:21 +00009140 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00009141 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009142 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009143 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009144 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009145
Benjamin Peterson29060642009-01-31 22:14:21 +00009146 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009147 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009148 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009149 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009150 return NULL;
9151}
9152
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009153#define FILL(kind, data, value, start, length) \
9154 do { \
9155 Py_ssize_t i_ = 0; \
9156 assert(kind != PyUnicode_WCHAR_KIND); \
9157 switch ((kind)) { \
9158 case PyUnicode_1BYTE_KIND: { \
9159 unsigned char * to_ = (unsigned char *)((data)) + (start); \
9160 memset(to_, (unsigned char)value, length); \
9161 break; \
9162 } \
9163 case PyUnicode_2BYTE_KIND: { \
9164 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
9165 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9166 break; \
9167 } \
9168 default: { \
9169 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
9170 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9171 break; \
9172 } \
9173 } \
9174 } while (0)
9175
Victor Stinner9310abb2011-10-05 00:59:23 +02009176static PyObject *
9177pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009178 Py_ssize_t left,
9179 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009180 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009181{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009182 PyObject *u;
9183 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009184 int kind;
9185 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009186
9187 if (left < 0)
9188 left = 0;
9189 if (right < 0)
9190 right = 0;
9191
Tim Peters7a29bd52001-09-12 03:03:31 +00009192 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009193 Py_INCREF(self);
9194 return self;
9195 }
9196
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009197 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9198 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009199 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9200 return NULL;
9201 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009202 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9203 if (fill > maxchar)
9204 maxchar = fill;
9205 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009206 if (!u)
9207 return NULL;
9208
9209 kind = PyUnicode_KIND(u);
9210 data = PyUnicode_DATA(u);
9211 if (left)
9212 FILL(kind, data, fill, 0, left);
9213 if (right)
9214 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinner157f83f2011-09-28 21:41:31 +02009215 if (PyUnicode_CopyCharacters(u, left,
9216 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009217 _PyUnicode_LENGTH(self)) < 0)
9218 {
9219 Py_DECREF(u);
9220 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009221 }
9222
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009223 assert(_PyUnicode_CheckConsistency(u, 1));
9224 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009225}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009226#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009227
Alexander Belopolsky40018472011-02-26 01:02:56 +00009228PyObject *
9229PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009230{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009231 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009232
9233 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009234 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009235 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009236
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009237 switch(PyUnicode_KIND(string)) {
9238 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009239 if (PyUnicode_IS_ASCII(string))
9240 list = asciilib_splitlines(
9241 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
9242 PyUnicode_GET_LENGTH(string), keepends);
9243 else
9244 list = ucs1lib_splitlines(
9245 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
9246 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009247 break;
9248 case PyUnicode_2BYTE_KIND:
9249 list = ucs2lib_splitlines(
9250 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
9251 PyUnicode_GET_LENGTH(string), keepends);
9252 break;
9253 case PyUnicode_4BYTE_KIND:
9254 list = ucs4lib_splitlines(
9255 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
9256 PyUnicode_GET_LENGTH(string), keepends);
9257 break;
9258 default:
9259 assert(0);
9260 list = 0;
9261 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009262 Py_DECREF(string);
9263 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009264}
9265
Alexander Belopolsky40018472011-02-26 01:02:56 +00009266static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009267split(PyObject *self,
9268 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009269 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009270{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009271 int kind1, kind2, kind;
9272 void *buf1, *buf2;
9273 Py_ssize_t len1, len2;
9274 PyObject* out;
9275
Guido van Rossumd57fd912000-03-10 22:53:23 +00009276 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009277 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009278
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009279 if (PyUnicode_READY(self) == -1)
9280 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009281
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009282 if (substring == NULL)
9283 switch(PyUnicode_KIND(self)) {
9284 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009285 if (PyUnicode_IS_ASCII(self))
9286 return asciilib_split_whitespace(
9287 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9288 PyUnicode_GET_LENGTH(self), maxcount
9289 );
9290 else
9291 return ucs1lib_split_whitespace(
9292 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9293 PyUnicode_GET_LENGTH(self), maxcount
9294 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009295 case PyUnicode_2BYTE_KIND:
9296 return ucs2lib_split_whitespace(
9297 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9298 PyUnicode_GET_LENGTH(self), maxcount
9299 );
9300 case PyUnicode_4BYTE_KIND:
9301 return ucs4lib_split_whitespace(
9302 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9303 PyUnicode_GET_LENGTH(self), maxcount
9304 );
9305 default:
9306 assert(0);
9307 return NULL;
9308 }
9309
9310 if (PyUnicode_READY(substring) == -1)
9311 return NULL;
9312
9313 kind1 = PyUnicode_KIND(self);
9314 kind2 = PyUnicode_KIND(substring);
9315 kind = kind1 > kind2 ? kind1 : kind2;
9316 buf1 = PyUnicode_DATA(self);
9317 buf2 = PyUnicode_DATA(substring);
9318 if (kind1 != kind)
9319 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9320 if (!buf1)
9321 return NULL;
9322 if (kind2 != kind)
9323 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9324 if (!buf2) {
9325 if (kind1 != kind) PyMem_Free(buf1);
9326 return NULL;
9327 }
9328 len1 = PyUnicode_GET_LENGTH(self);
9329 len2 = PyUnicode_GET_LENGTH(substring);
9330
9331 switch(kind) {
9332 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009333 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9334 out = asciilib_split(
9335 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9336 else
9337 out = ucs1lib_split(
9338 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009339 break;
9340 case PyUnicode_2BYTE_KIND:
9341 out = ucs2lib_split(
9342 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9343 break;
9344 case PyUnicode_4BYTE_KIND:
9345 out = ucs4lib_split(
9346 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9347 break;
9348 default:
9349 out = NULL;
9350 }
9351 if (kind1 != kind)
9352 PyMem_Free(buf1);
9353 if (kind2 != kind)
9354 PyMem_Free(buf2);
9355 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009356}
9357
Alexander Belopolsky40018472011-02-26 01:02:56 +00009358static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009359rsplit(PyObject *self,
9360 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009361 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009362{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009363 int kind1, kind2, kind;
9364 void *buf1, *buf2;
9365 Py_ssize_t len1, len2;
9366 PyObject* out;
9367
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009368 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009369 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009370
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009371 if (PyUnicode_READY(self) == -1)
9372 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009373
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009374 if (substring == NULL)
9375 switch(PyUnicode_KIND(self)) {
9376 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009377 if (PyUnicode_IS_ASCII(self))
9378 return asciilib_rsplit_whitespace(
9379 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9380 PyUnicode_GET_LENGTH(self), maxcount
9381 );
9382 else
9383 return ucs1lib_rsplit_whitespace(
9384 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9385 PyUnicode_GET_LENGTH(self), maxcount
9386 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009387 case PyUnicode_2BYTE_KIND:
9388 return ucs2lib_rsplit_whitespace(
9389 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9390 PyUnicode_GET_LENGTH(self), maxcount
9391 );
9392 case PyUnicode_4BYTE_KIND:
9393 return ucs4lib_rsplit_whitespace(
9394 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9395 PyUnicode_GET_LENGTH(self), maxcount
9396 );
9397 default:
9398 assert(0);
9399 return NULL;
9400 }
9401
9402 if (PyUnicode_READY(substring) == -1)
9403 return NULL;
9404
9405 kind1 = PyUnicode_KIND(self);
9406 kind2 = PyUnicode_KIND(substring);
9407 kind = kind1 > kind2 ? kind1 : kind2;
9408 buf1 = PyUnicode_DATA(self);
9409 buf2 = PyUnicode_DATA(substring);
9410 if (kind1 != kind)
9411 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9412 if (!buf1)
9413 return NULL;
9414 if (kind2 != kind)
9415 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9416 if (!buf2) {
9417 if (kind1 != kind) PyMem_Free(buf1);
9418 return NULL;
9419 }
9420 len1 = PyUnicode_GET_LENGTH(self);
9421 len2 = PyUnicode_GET_LENGTH(substring);
9422
9423 switch(kind) {
9424 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009425 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9426 out = asciilib_rsplit(
9427 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9428 else
9429 out = ucs1lib_rsplit(
9430 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009431 break;
9432 case PyUnicode_2BYTE_KIND:
9433 out = ucs2lib_rsplit(
9434 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9435 break;
9436 case PyUnicode_4BYTE_KIND:
9437 out = ucs4lib_rsplit(
9438 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9439 break;
9440 default:
9441 out = NULL;
9442 }
9443 if (kind1 != kind)
9444 PyMem_Free(buf1);
9445 if (kind2 != kind)
9446 PyMem_Free(buf2);
9447 return out;
9448}
9449
9450static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009451anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
9452 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009453{
9454 switch(kind) {
9455 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009456 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
9457 return asciilib_find(buf1, len1, buf2, len2, offset);
9458 else
9459 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009460 case PyUnicode_2BYTE_KIND:
9461 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9462 case PyUnicode_4BYTE_KIND:
9463 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9464 }
9465 assert(0);
9466 return -1;
9467}
9468
9469static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009470anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
9471 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009472{
9473 switch(kind) {
9474 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009475 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
9476 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
9477 else
9478 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009479 case PyUnicode_2BYTE_KIND:
9480 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9481 case PyUnicode_4BYTE_KIND:
9482 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9483 }
9484 assert(0);
9485 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009486}
9487
Alexander Belopolsky40018472011-02-26 01:02:56 +00009488static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009489replace(PyObject *self, PyObject *str1,
9490 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009491{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009492 PyObject *u;
9493 char *sbuf = PyUnicode_DATA(self);
9494 char *buf1 = PyUnicode_DATA(str1);
9495 char *buf2 = PyUnicode_DATA(str2);
9496 int srelease = 0, release1 = 0, release2 = 0;
9497 int skind = PyUnicode_KIND(self);
9498 int kind1 = PyUnicode_KIND(str1);
9499 int kind2 = PyUnicode_KIND(str2);
9500 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9501 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9502 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009503
9504 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009505 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009506 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009507 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009508
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009509 if (skind < kind1)
9510 /* substring too wide to be present */
9511 goto nothing;
9512
9513 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00009514 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009515 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009516 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009517 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009518 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009519 /* replace characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009520 Py_UCS4 u1, u2, maxchar;
9521 int mayshrink, rkind;
9522 u1 = PyUnicode_READ_CHAR(str1, 0);
9523 if (!findchar(sbuf, PyUnicode_KIND(self),
9524 slen, u1, 1))
Thomas Wouters477c8d52006-05-27 19:21:47 +00009525 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009526 u2 = PyUnicode_READ_CHAR(str2, 0);
9527 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9528 /* Replacing u1 with u2 may cause a maxchar reduction in the
9529 result string. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009530 if (u2 > maxchar) {
9531 maxchar = u2;
9532 mayshrink = 0;
9533 }
Victor Stinnerb9275c12011-10-05 14:01:42 +02009534 else
9535 mayshrink = maxchar > 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009536 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009537 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009538 goto error;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009539 if (PyUnicode_CopyCharacters(u, 0,
9540 (PyObject*)self, 0, slen) < 0)
9541 {
9542 Py_DECREF(u);
9543 return NULL;
9544 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009545 rkind = PyUnicode_KIND(u);
9546 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
9547 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009548 if (--maxcount < 0)
9549 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009550 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009551 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009552 if (mayshrink) {
9553 PyObject *tmp = u;
9554 u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp),
9555 PyUnicode_GET_LENGTH(tmp));
9556 Py_DECREF(tmp);
9557 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009558 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009559 int rkind = skind;
9560 char *res;
9561 if (kind1 < rkind) {
9562 /* widen substring */
9563 buf1 = _PyUnicode_AsKind(str1, rkind);
9564 if (!buf1) goto error;
9565 release1 = 1;
9566 }
Victor Stinnerc3cec782011-10-05 21:24:08 +02009567 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009568 if (i < 0)
9569 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009570 if (rkind > kind2) {
9571 /* widen replacement */
9572 buf2 = _PyUnicode_AsKind(str2, rkind);
9573 if (!buf2) goto error;
9574 release2 = 1;
9575 }
9576 else if (rkind < kind2) {
9577 /* widen self and buf1 */
9578 rkind = kind2;
9579 if (release1) PyMem_Free(buf1);
9580 sbuf = _PyUnicode_AsKind(self, rkind);
9581 if (!sbuf) goto error;
9582 srelease = 1;
9583 buf1 = _PyUnicode_AsKind(str1, rkind);
9584 if (!buf1) goto error;
9585 release1 = 1;
9586 }
9587 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen));
9588 if (!res) {
9589 PyErr_NoMemory();
9590 goto error;
9591 }
9592 memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen));
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009593 /* change everything in-place, starting with this one */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009594 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9595 buf2,
9596 PyUnicode_KIND_SIZE(rkind, len2));
9597 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009598
9599 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +02009600 i = anylib_find(rkind, self,
9601 sbuf+PyUnicode_KIND_SIZE(rkind, i), slen-i,
9602 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009603 if (i == -1)
9604 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009605 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9606 buf2,
9607 PyUnicode_KIND_SIZE(rkind, len2));
9608 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009609 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009610
9611 u = PyUnicode_FromKindAndData(rkind, res, slen);
9612 PyMem_Free(res);
9613 if (!u) goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009614 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009615 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009616
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009617 Py_ssize_t n, i, j, ires;
9618 Py_ssize_t product, new_size;
9619 int rkind = skind;
9620 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009621
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009622 if (kind1 < rkind) {
9623 buf1 = _PyUnicode_AsKind(str1, rkind);
9624 if (!buf1) goto error;
9625 release1 = 1;
9626 }
Victor Stinnerc3cec782011-10-05 21:24:08 +02009627 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009628 if (n == 0)
9629 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009630 if (kind2 < rkind) {
9631 buf2 = _PyUnicode_AsKind(str2, rkind);
9632 if (!buf2) goto error;
9633 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009634 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009635 else if (kind2 > rkind) {
9636 rkind = kind2;
9637 sbuf = _PyUnicode_AsKind(self, rkind);
9638 if (!sbuf) goto error;
9639 srelease = 1;
9640 if (release1) PyMem_Free(buf1);
9641 buf1 = _PyUnicode_AsKind(str1, rkind);
9642 if (!buf1) goto error;
9643 release1 = 1;
9644 }
9645 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
9646 PyUnicode_GET_LENGTH(str1))); */
9647 product = n * (len2-len1);
9648 if ((product / (len2-len1)) != n) {
9649 PyErr_SetString(PyExc_OverflowError,
9650 "replace string is too long");
9651 goto error;
9652 }
9653 new_size = slen + product;
9654 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
9655 PyErr_SetString(PyExc_OverflowError,
9656 "replace string is too long");
9657 goto error;
9658 }
9659 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size));
9660 if (!res)
9661 goto error;
9662 ires = i = 0;
9663 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009664 while (n-- > 0) {
9665 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +02009666 j = anylib_find(rkind, self,
9667 sbuf + PyUnicode_KIND_SIZE(rkind, i), slen-i,
9668 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009669 if (j == -1)
9670 break;
9671 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009672 /* copy unchanged part [i:j] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009673 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9674 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9675 PyUnicode_KIND_SIZE(rkind, j-i));
9676 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009677 }
9678 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009679 if (len2 > 0) {
9680 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9681 buf2,
9682 PyUnicode_KIND_SIZE(rkind, len2));
9683 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009684 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009685 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009686 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009687 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009688 /* copy tail [i:] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009689 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9690 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9691 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009692 } else {
9693 /* interleave */
9694 while (n > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009695 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9696 buf2,
9697 PyUnicode_KIND_SIZE(rkind, len2));
9698 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009699 if (--n <= 0)
9700 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009701 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9702 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9703 PyUnicode_KIND_SIZE(rkind, 1));
9704 ires++;
9705 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009706 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009707 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9708 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9709 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009710 }
Victor Stinnerf48323e2011-10-05 23:27:08 +02009711 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(str2))
9712 u = unicode_fromascii((unsigned char*)res, new_size);
9713 else
9714 u = PyUnicode_FromKindAndData(rkind, res, new_size);
Martin v. Löwis0b1d3482011-10-01 16:35:40 +02009715 PyMem_Free(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009716 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009717 if (srelease)
9718 PyMem_FREE(sbuf);
9719 if (release1)
9720 PyMem_FREE(buf1);
9721 if (release2)
9722 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009723 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009724 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009725
Benjamin Peterson29060642009-01-31 22:14:21 +00009726 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00009727 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009728 if (srelease)
9729 PyMem_FREE(sbuf);
9730 if (release1)
9731 PyMem_FREE(buf1);
9732 if (release2)
9733 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009734 if (PyUnicode_CheckExact(self)) {
9735 Py_INCREF(self);
9736 return (PyObject *) self;
9737 }
Victor Stinner034f6cf2011-09-30 02:26:44 +02009738 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009739 error:
9740 if (srelease && sbuf)
9741 PyMem_FREE(sbuf);
9742 if (release1 && buf1)
9743 PyMem_FREE(buf1);
9744 if (release2 && buf2)
9745 PyMem_FREE(buf2);
9746 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009747}
9748
9749/* --- Unicode Object Methods --------------------------------------------- */
9750
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009751PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009752 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009753\n\
9754Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009755characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009756
9757static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +02009758unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009759{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009760 return fixup(self, fixtitle);
9761}
9762
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009763PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009764 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009765\n\
9766Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00009767have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009768
9769static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +02009770unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009771{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009772 return fixup(self, fixcapitalize);
9773}
9774
#if 0
/* NOTE: everything up to the matching #endif is excluded from compilation
   by the "#if 0" above. */
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split `self` into words, run fixup(..., fixcapitalize) on each word and
   join the results with PyUnicode_Join().  Returns the joined string, or
   NULL if splitting or any per-word fixup fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *list;
    PyObject *item;
    Py_ssize_t i;

    /* Split into words */
    list = split(self, NULL, -1);
    if (!list)
        return NULL;

    /* Capitalize each word */
    for (i = 0; i < PyList_GET_SIZE(list); i++) {
        item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
                     fixcapitalize);
        if (item == NULL)
            goto onError;
        /* swap the capitalized word into the list slot, dropping the
           reference to the word it replaces */
        Py_DECREF(PyList_GET_ITEM(list, i));
        PyList_SET_ITEM(list, i, item);
    }

    /* Join the words to form a new string */
    item = PyUnicode_Join(NULL, list);

  onError:
    /* reached on success too; on the failure path item is NULL here */
    Py_DECREF(list);
    return (PyObject *)item;
}
#endif
9812
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009813/* Argument converter. Coerces to a single unicode character */
9814
9815static int
9816convert_uc(PyObject *obj, void *addr)
9817{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009818 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009819 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009820
Benjamin Peterson14339b62009-01-31 16:36:08 +00009821 uniobj = PyUnicode_FromObject(obj);
9822 if (uniobj == NULL) {
9823 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009824 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009825 return 0;
9826 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009827 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009828 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009829 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009830 Py_DECREF(uniobj);
9831 return 0;
9832 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009833 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009834 Py_DECREF(uniobj);
9835 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009836}
9837
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009838PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009839 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009840\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009841Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009842done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009843
9844static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009845unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009846{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009847 Py_ssize_t marg, left;
9848 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009849 Py_UCS4 fillchar = ' ';
9850
Victor Stinnere9a29352011-10-01 02:14:59 +02009851 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009852 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009853
Victor Stinnere9a29352011-10-01 02:14:59 +02009854 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009855 return NULL;
9856
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009857 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009858 Py_INCREF(self);
9859 return (PyObject*) self;
9860 }
9861
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009862 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009863 left = marg / 2 + (marg & width & 1);
9864
Victor Stinner9310abb2011-10-05 00:59:23 +02009865 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009866}
9867
Marc-André Lemburge5034372000-08-08 08:04:29 +00009868#if 0
9869
9870/* This code should go into some future Unicode collation support
9871 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00009872 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00009873
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009874/* speedy UTF-16 code point order comparison */
9875/* gleaned from: */
9876/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
9877
/* NOTE: this table and the unicode_compare() variant below sit in the
   "#if 0" branch of the surrounding conditional and are therefore not
   compiled; the "#else" branch provides the variant that is used. */

/* Adjustment added to a code unit, indexed by the unit's top five bits
   (c >> 11); see the comparison loop below. */
static short utf16Fixup[32] =
{
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
};

/* Compare two strings code unit by code unit after applying the utf16Fixup
   adjustment.  Returns -1, 0 or 1. */
static int
unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
{
    Py_ssize_t len1, len2;

    Py_UNICODE *s1 = str1->str;
    Py_UNICODE *s2 = str2->str;

    len1 = str1->_base._base.length;
    len2 = str2->_base._base.length;

    while (len1 > 0 && len2 > 0) {
        Py_UNICODE c1, c2;

        c1 = *s1++;
        c2 = *s2++;

        /* only units above 26 * 2048 (0xD000) are adjusted */
        if (c1 > (1<<11) * 26)
            c1 += utf16Fixup[c1>>11];
        if (c2 > (1<<11) * 26)
            c2 += utf16Fixup[c2>>11];
        /* now c1 and c2 are in UTF-32-compatible order */

        if (c1 != c2)
            return (c1 < c2) ? -1 : 1;

        len1--; len2--;
    }

    /* one string is a prefix of the other: the shorter one sorts first */
    return (len1 < len2) ? -1 : (len1 != len2);
}
9917
Marc-André Lemburge5034372000-08-08 08:04:29 +00009918#else
9919
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009920/* This function assumes that str1 and str2 are readied by the caller. */
9921
Marc-André Lemburge5034372000-08-08 08:04:29 +00009922static int
9923unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9924{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009925 int kind1, kind2;
9926 void *data1, *data2;
9927 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009928
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009929 kind1 = PyUnicode_KIND(str1);
9930 kind2 = PyUnicode_KIND(str2);
9931 data1 = PyUnicode_DATA(str1);
9932 data2 = PyUnicode_DATA(str2);
9933 len1 = PyUnicode_GET_LENGTH(str1);
9934 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +00009935
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009936 for (i = 0; i < len1 && i < len2; ++i) {
9937 Py_UCS4 c1, c2;
9938 c1 = PyUnicode_READ(kind1, data1, i);
9939 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +00009940
9941 if (c1 != c2)
9942 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009943 }
9944
9945 return (len1 < len2) ? -1 : (len1 != len2);
9946}
9947
9948#endif
9949
Alexander Belopolsky40018472011-02-26 01:02:56 +00009950int
9951PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009952{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009953 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9954 if (PyUnicode_READY(left) == -1 ||
9955 PyUnicode_READY(right) == -1)
9956 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009957 return unicode_compare((PyUnicodeObject *)left,
9958 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009959 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009960 PyErr_Format(PyExc_TypeError,
9961 "Can't compare %.100s and %.100s",
9962 left->ob_type->tp_name,
9963 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009964 return -1;
9965}
9966
Martin v. Löwis5b222132007-06-10 09:51:05 +00009967int
9968PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
9969{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009970 Py_ssize_t i;
9971 int kind;
9972 void *data;
9973 Py_UCS4 chr;
9974
Victor Stinner910337b2011-10-03 03:20:16 +02009975 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009976 if (PyUnicode_READY(uni) == -1)
9977 return -1;
9978 kind = PyUnicode_KIND(uni);
9979 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009980 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009981 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
9982 if (chr != str[i])
9983 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00009984 /* This check keeps Python strings that end in '\0' from comparing equal
9985 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009986 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +00009987 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009988 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00009989 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009990 return 0;
9991}
9992
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009993
Benjamin Peterson29060642009-01-31 22:14:21 +00009994#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00009995 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009996
Alexander Belopolsky40018472011-02-26 01:02:56 +00009997PyObject *
9998PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009999{
10000 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010001
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010002 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10003 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010004 if (PyUnicode_READY(left) == -1 ||
10005 PyUnicode_READY(right) == -1)
10006 return NULL;
10007 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10008 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010009 if (op == Py_EQ) {
10010 Py_INCREF(Py_False);
10011 return Py_False;
10012 }
10013 if (op == Py_NE) {
10014 Py_INCREF(Py_True);
10015 return Py_True;
10016 }
10017 }
10018 if (left == right)
10019 result = 0;
10020 else
10021 result = unicode_compare((PyUnicodeObject *)left,
10022 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010023
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010024 /* Convert the return value to a Boolean */
10025 switch (op) {
10026 case Py_EQ:
10027 v = TEST_COND(result == 0);
10028 break;
10029 case Py_NE:
10030 v = TEST_COND(result != 0);
10031 break;
10032 case Py_LE:
10033 v = TEST_COND(result <= 0);
10034 break;
10035 case Py_GE:
10036 v = TEST_COND(result >= 0);
10037 break;
10038 case Py_LT:
10039 v = TEST_COND(result == -1);
10040 break;
10041 case Py_GT:
10042 v = TEST_COND(result == 1);
10043 break;
10044 default:
10045 PyErr_BadArgument();
10046 return NULL;
10047 }
10048 Py_INCREF(v);
10049 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010050 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010051
Brian Curtindfc80e32011-08-10 20:28:54 -050010052 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010053}
10054
Alexander Belopolsky40018472011-02-26 01:02:56 +000010055int
10056PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010057{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010058 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010059 int kind1, kind2, kind;
10060 void *buf1, *buf2;
10061 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010062 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010063
10064 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010065 sub = PyUnicode_FromObject(element);
10066 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010067 PyErr_Format(PyExc_TypeError,
10068 "'in <string>' requires string as left operand, not %s",
10069 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010070 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010071 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010072 if (PyUnicode_READY(sub) == -1)
10073 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010074
Thomas Wouters477c8d52006-05-27 19:21:47 +000010075 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +020010076 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010077 Py_DECREF(sub);
10078 return -1;
10079 }
10080
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010081 kind1 = PyUnicode_KIND(str);
10082 kind2 = PyUnicode_KIND(sub);
10083 kind = kind1 > kind2 ? kind1 : kind2;
10084 buf1 = PyUnicode_DATA(str);
10085 buf2 = PyUnicode_DATA(sub);
10086 if (kind1 != kind)
10087 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
10088 if (!buf1) {
10089 Py_DECREF(sub);
10090 return -1;
10091 }
10092 if (kind2 != kind)
10093 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
10094 if (!buf2) {
10095 Py_DECREF(sub);
10096 if (kind1 != kind) PyMem_Free(buf1);
10097 return -1;
10098 }
10099 len1 = PyUnicode_GET_LENGTH(str);
10100 len2 = PyUnicode_GET_LENGTH(sub);
10101
10102 switch(kind) {
10103 case PyUnicode_1BYTE_KIND:
10104 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10105 break;
10106 case PyUnicode_2BYTE_KIND:
10107 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10108 break;
10109 case PyUnicode_4BYTE_KIND:
10110 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10111 break;
10112 default:
10113 result = -1;
10114 assert(0);
10115 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010116
10117 Py_DECREF(str);
10118 Py_DECREF(sub);
10119
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010120 if (kind1 != kind)
10121 PyMem_Free(buf1);
10122 if (kind2 != kind)
10123 PyMem_Free(buf2);
10124
Guido van Rossum403d68b2000-03-13 15:55:09 +000010125 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010126}
10127
Guido van Rossumd57fd912000-03-10 22:53:23 +000010128/* Concat to string or Unicode object giving a new Unicode object. */
10129
Alexander Belopolsky40018472011-02-26 01:02:56 +000010130PyObject *
10131PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010132{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010133 PyObject *u = NULL, *v = NULL, *w;
10134 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010135
10136 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010137 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010138 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010139 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010140 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010141 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010142 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010143
10144 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010145 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010146 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010147 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010148 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010149 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010150 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010151 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010152 }
10153
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010154 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinnerff9e50f2011-09-28 22:17:19 +020010155 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010156
Guido van Rossumd57fd912000-03-10 22:53:23 +000010157 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010158 w = PyUnicode_New(
10159 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
10160 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010161 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010162 goto onError;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010163 if (PyUnicode_CopyCharacters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)) < 0)
10164 goto onError;
Victor Stinner157f83f2011-09-28 21:41:31 +020010165 if (PyUnicode_CopyCharacters(w, PyUnicode_GET_LENGTH(u),
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010166 v, 0,
10167 PyUnicode_GET_LENGTH(v)) < 0)
10168 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010169 Py_DECREF(u);
10170 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010171 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010172 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010173
Benjamin Peterson29060642009-01-31 22:14:21 +000010174 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010175 Py_XDECREF(u);
10176 Py_XDECREF(v);
10177 return NULL;
10178}
10179
Victor Stinnerb0923652011-10-04 01:17:31 +020010180static void
10181unicode_append_inplace(PyObject **p_left, PyObject *right)
10182{
10183 Py_ssize_t left_len, right_len, new_len;
10184#ifdef Py_DEBUG
10185 Py_ssize_t copied;
10186#endif
10187
10188 assert(PyUnicode_IS_READY(*p_left));
10189 assert(PyUnicode_IS_READY(right));
10190
10191 left_len = PyUnicode_GET_LENGTH(*p_left);
10192 right_len = PyUnicode_GET_LENGTH(right);
10193 if (left_len > PY_SSIZE_T_MAX - right_len) {
10194 PyErr_SetString(PyExc_OverflowError,
10195 "strings are too large to concat");
10196 goto error;
10197 }
10198 new_len = left_len + right_len;
10199
10200 /* Now we own the last reference to 'left', so we can resize it
10201 * in-place.
10202 */
10203 if (unicode_resize(p_left, new_len) != 0) {
10204 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10205 * deallocated so it cannot be put back into
10206 * 'variable'. The MemoryError is raised when there
10207 * is no value in 'variable', which might (very
10208 * remotely) be a cause of incompatibilities.
10209 */
10210 goto error;
10211 }
10212 /* copy 'right' into the newly allocated area of 'left' */
10213#ifdef Py_DEBUG
10214 copied = PyUnicode_CopyCharacters(*p_left, left_len,
10215 right, 0,
10216 right_len);
10217 assert(0 <= copied);
10218#else
10219 PyUnicode_CopyCharacters(*p_left, left_len, right, 0, right_len);
10220#endif
10221 return;
10222
10223error:
10224 Py_DECREF(*p_left);
10225 *p_left = NULL;
10226}
10227
Walter Dörwald1ab83302007-05-18 17:15:44 +000010228void
Victor Stinner23e56682011-10-03 03:54:37 +020010229PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010230{
Victor Stinner23e56682011-10-03 03:54:37 +020010231 PyObject *left, *res;
10232
10233 if (p_left == NULL) {
10234 if (!PyErr_Occurred())
10235 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010236 return;
10237 }
Victor Stinner23e56682011-10-03 03:54:37 +020010238 left = *p_left;
10239 if (right == NULL || !PyUnicode_Check(left)) {
10240 if (!PyErr_Occurred())
10241 PyErr_BadInternalCall();
10242 goto error;
10243 }
10244
Victor Stinnere1335c72011-10-04 20:53:03 +020010245 if (PyUnicode_READY(left))
10246 goto error;
10247 if (PyUnicode_READY(right))
10248 goto error;
10249
Victor Stinner23e56682011-10-03 03:54:37 +020010250 if (PyUnicode_CheckExact(left) && left != unicode_empty
10251 && PyUnicode_CheckExact(right) && right != unicode_empty
10252 && unicode_resizable(left)
10253 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
10254 || _PyUnicode_WSTR(left) != NULL))
10255 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010256 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10257 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010258 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010259 not so different than duplicating the string. */
10260 if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
Victor Stinner23e56682011-10-03 03:54:37 +020010261 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010262 unicode_append_inplace(p_left, right);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010263 if (p_left != NULL)
10264 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010265 return;
10266 }
10267 }
10268
10269 res = PyUnicode_Concat(left, right);
10270 if (res == NULL)
10271 goto error;
10272 Py_DECREF(left);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010273 assert(_PyUnicode_CheckConsistency(res, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010274 *p_left = res;
10275 return;
10276
10277error:
10278 Py_DECREF(*p_left);
10279 *p_left = NULL;
Walter Dörwald1ab83302007-05-18 17:15:44 +000010280}
10281
10282void
10283PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10284{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010285 PyUnicode_Append(pleft, right);
10286 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010287}
10288
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010289PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010290 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010291\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010292Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010293string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010294interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010295
10296static PyObject *
10297unicode_count(PyUnicodeObject *self, PyObject *args)
10298{
10299 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010300 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010301 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010302 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010303 int kind1, kind2, kind;
10304 void *buf1, *buf2;
10305 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010306
Jesus Ceaac451502011-04-20 17:09:23 +020010307 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10308 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010309 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010310
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010311 kind1 = PyUnicode_KIND(self);
10312 kind2 = PyUnicode_KIND(substring);
10313 kind = kind1 > kind2 ? kind1 : kind2;
10314 buf1 = PyUnicode_DATA(self);
10315 buf2 = PyUnicode_DATA(substring);
10316 if (kind1 != kind)
10317 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
10318 if (!buf1) {
10319 Py_DECREF(substring);
10320 return NULL;
10321 }
10322 if (kind2 != kind)
10323 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
10324 if (!buf2) {
10325 Py_DECREF(substring);
10326 if (kind1 != kind) PyMem_Free(buf1);
10327 return NULL;
10328 }
10329 len1 = PyUnicode_GET_LENGTH(self);
10330 len2 = PyUnicode_GET_LENGTH(substring);
10331
10332 ADJUST_INDICES(start, end, len1);
10333 switch(kind) {
10334 case PyUnicode_1BYTE_KIND:
10335 iresult = ucs1lib_count(
10336 ((Py_UCS1*)buf1) + start, end - start,
10337 buf2, len2, PY_SSIZE_T_MAX
10338 );
10339 break;
10340 case PyUnicode_2BYTE_KIND:
10341 iresult = ucs2lib_count(
10342 ((Py_UCS2*)buf1) + start, end - start,
10343 buf2, len2, PY_SSIZE_T_MAX
10344 );
10345 break;
10346 case PyUnicode_4BYTE_KIND:
10347 iresult = ucs4lib_count(
10348 ((Py_UCS4*)buf1) + start, end - start,
10349 buf2, len2, PY_SSIZE_T_MAX
10350 );
10351 break;
10352 default:
10353 assert(0); iresult = 0;
10354 }
10355
10356 result = PyLong_FromSsize_t(iresult);
10357
10358 if (kind1 != kind)
10359 PyMem_Free(buf1);
10360 if (kind2 != kind)
10361 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010362
10363 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010364
Guido van Rossumd57fd912000-03-10 22:53:23 +000010365 return result;
10366}
10367
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010368PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010369 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010370\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010371Encode S using the codec registered for encoding. Default encoding\n\
10372is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010373handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010374a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10375'xmlcharrefreplace' as well as any other name registered with\n\
10376codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010377
10378static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +000010379unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010380{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010381 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010382 char *encoding = NULL;
10383 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010384
Benjamin Peterson308d6372009-09-18 21:42:35 +000010385 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10386 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010387 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +000010388 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010389}
10390
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010391PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010392 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010393\n\
10394Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010395If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010396
10397static PyObject*
10398unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
10399{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010400 Py_ssize_t i, j, line_pos, src_len, incr;
10401 Py_UCS4 ch;
10402 PyObject *u;
10403 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010404 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010405 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010406 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010407
10408 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010409 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010410
Antoine Pitrou22425222011-10-04 19:10:51 +020010411 if (PyUnicode_READY(self) == -1)
10412 return NULL;
10413
Thomas Wouters7e474022000-07-16 12:04:32 +000010414 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010415 src_len = PyUnicode_GET_LENGTH(self);
10416 i = j = line_pos = 0;
10417 kind = PyUnicode_KIND(self);
10418 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010419 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010420 for (; i < src_len; i++) {
10421 ch = PyUnicode_READ(kind, src_data, i);
10422 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010423 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010424 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010425 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010426 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010427 goto overflow;
10428 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010429 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010430 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010431 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010432 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010433 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010434 goto overflow;
10435 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010436 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010437 if (ch == '\n' || ch == '\r')
10438 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010439 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010440 }
Antoine Pitroue19aa382011-10-04 16:04:01 +020010441 if (!found && PyUnicode_CheckExact(self)) {
10442 Py_INCREF((PyObject *) self);
10443 return (PyObject *) self;
10444 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010445
Guido van Rossumd57fd912000-03-10 22:53:23 +000010446 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010447 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010448 if (!u)
10449 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010450 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010451
Antoine Pitroue71d5742011-10-04 15:55:09 +020010452 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010453
Antoine Pitroue71d5742011-10-04 15:55:09 +020010454 for (; i < src_len; i++) {
10455 ch = PyUnicode_READ(kind, src_data, i);
10456 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010457 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010458 incr = tabsize - (line_pos % tabsize);
10459 line_pos += incr;
10460 while (incr--) {
10461 PyUnicode_WRITE(kind, dest_data, j, ' ');
10462 j++;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010463 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010464 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010465 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010466 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010467 line_pos++;
10468 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010469 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010470 if (ch == '\n' || ch == '\r')
10471 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010472 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010473 }
10474 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinner17efeed2011-10-04 20:05:46 +020010475#ifndef DONT_MAKE_RESULT_READY
10476 if (_PyUnicode_READY_REPLACE(&u)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010477 Py_DECREF(u);
10478 return NULL;
10479 }
Victor Stinner17efeed2011-10-04 20:05:46 +020010480#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010481 assert(_PyUnicode_CheckConsistency(u, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010482 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010483
Antoine Pitroue71d5742011-10-04 15:55:09 +020010484 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010485 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10486 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010487}
10488
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010489PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010490 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010491\n\
10492Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010493such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010494arguments start and end are interpreted as in slice notation.\n\
10495\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010496Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010497
10498static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010499unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010500{
Jesus Ceaac451502011-04-20 17:09:23 +020010501 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010502 Py_ssize_t start;
10503 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010504 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010505
Jesus Ceaac451502011-04-20 17:09:23 +020010506 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10507 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010508 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010509
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010510 if (PyUnicode_READY(self) == -1)
10511 return NULL;
10512 if (PyUnicode_READY(substring) == -1)
10513 return NULL;
10514
10515 result = any_find_slice(
Victor Stinnerc3cec782011-10-05 21:24:08 +020010516 asciilib_find_slice, ucs1lib_find_slice,
10517 ucs2lib_find_slice, ucs4lib_find_slice,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010518 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010519 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010520
10521 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010522
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010523 if (result == -2)
10524 return NULL;
10525
Christian Heimes217cfd12007-12-02 14:31:20 +000010526 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010527}
10528
10529static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010530unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010531{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010532 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
10533 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010534 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010535 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010536}
10537
Guido van Rossumc2504932007-09-18 19:42:40 +000010538/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010010539 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000010540static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +000010541unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010542{
Guido van Rossumc2504932007-09-18 19:42:40 +000010543 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010010544 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010545
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010546 if (_PyUnicode_HASH(self) != -1)
10547 return _PyUnicode_HASH(self);
10548 if (PyUnicode_READY(self) == -1)
10549 return -1;
10550 len = PyUnicode_GET_LENGTH(self);
10551
10552 /* The hash function as a macro, gets expanded three times below. */
10553#define HASH(P) \
10554 x = (Py_uhash_t)*P << 7; \
10555 while (--len >= 0) \
10556 x = (1000003*x) ^ (Py_uhash_t)*P++;
10557
10558 switch (PyUnicode_KIND(self)) {
10559 case PyUnicode_1BYTE_KIND: {
10560 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
10561 HASH(c);
10562 break;
10563 }
10564 case PyUnicode_2BYTE_KIND: {
10565 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
10566 HASH(s);
10567 break;
10568 }
10569 default: {
10570 Py_UCS4 *l;
10571 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
10572 "Impossible switch case in unicode_hash");
10573 l = PyUnicode_4BYTE_DATA(self);
10574 HASH(l);
10575 break;
10576 }
10577 }
10578 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
10579
Guido van Rossumc2504932007-09-18 19:42:40 +000010580 if (x == -1)
10581 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010582 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010583 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010584}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010585#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000010586
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010587PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010588 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010589\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010590Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010591
10592static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010593unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010594{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010595 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +020010596 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010597 Py_ssize_t start;
10598 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010599
Jesus Ceaac451502011-04-20 17:09:23 +020010600 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
10601 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010602 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010603
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010604 if (PyUnicode_READY(self) == -1)
10605 return NULL;
10606 if (PyUnicode_READY(substring) == -1)
10607 return NULL;
10608
10609 result = any_find_slice(
Victor Stinnerc3cec782011-10-05 21:24:08 +020010610 asciilib_find_slice, ucs1lib_find_slice,
10611 ucs2lib_find_slice, ucs4lib_find_slice,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010612 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010613 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010614
10615 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010616
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010617 if (result == -2)
10618 return NULL;
10619
Guido van Rossumd57fd912000-03-10 22:53:23 +000010620 if (result < 0) {
10621 PyErr_SetString(PyExc_ValueError, "substring not found");
10622 return NULL;
10623 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010624
Christian Heimes217cfd12007-12-02 14:31:20 +000010625 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010626}
10627
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010628PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010629 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010630\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010631Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010632at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010633
10634static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010635unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010636{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010637 Py_ssize_t i, length;
10638 int kind;
10639 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010640 int cased;
10641
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010642 if (PyUnicode_READY(self) == -1)
10643 return NULL;
10644 length = PyUnicode_GET_LENGTH(self);
10645 kind = PyUnicode_KIND(self);
10646 data = PyUnicode_DATA(self);
10647
Guido van Rossumd57fd912000-03-10 22:53:23 +000010648 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010649 if (length == 1)
10650 return PyBool_FromLong(
10651 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010652
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010653 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010654 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010655 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010656
Guido van Rossumd57fd912000-03-10 22:53:23 +000010657 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010658 for (i = 0; i < length; i++) {
10659 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010660
Benjamin Peterson29060642009-01-31 22:14:21 +000010661 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
10662 return PyBool_FromLong(0);
10663 else if (!cased && Py_UNICODE_ISLOWER(ch))
10664 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010665 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010666 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010667}
10668
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010669PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010670 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010671\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010672Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010673at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010674
10675static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010676unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010677{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010678 Py_ssize_t i, length;
10679 int kind;
10680 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010681 int cased;
10682
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010683 if (PyUnicode_READY(self) == -1)
10684 return NULL;
10685 length = PyUnicode_GET_LENGTH(self);
10686 kind = PyUnicode_KIND(self);
10687 data = PyUnicode_DATA(self);
10688
Guido van Rossumd57fd912000-03-10 22:53:23 +000010689 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010690 if (length == 1)
10691 return PyBool_FromLong(
10692 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010693
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010694 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010695 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010696 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010697
Guido van Rossumd57fd912000-03-10 22:53:23 +000010698 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010699 for (i = 0; i < length; i++) {
10700 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010701
Benjamin Peterson29060642009-01-31 22:14:21 +000010702 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
10703 return PyBool_FromLong(0);
10704 else if (!cased && Py_UNICODE_ISUPPER(ch))
10705 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010706 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010707 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010708}
10709
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010710PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010711 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010712\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010713Return True if S is a titlecased string and there is at least one\n\
10714character in S, i.e. upper- and titlecase characters may only\n\
10715follow uncased characters and lowercase characters only cased ones.\n\
10716Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010717
10718static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010719unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010720{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010721 Py_ssize_t i, length;
10722 int kind;
10723 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010724 int cased, previous_is_cased;
10725
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010726 if (PyUnicode_READY(self) == -1)
10727 return NULL;
10728 length = PyUnicode_GET_LENGTH(self);
10729 kind = PyUnicode_KIND(self);
10730 data = PyUnicode_DATA(self);
10731
Guido van Rossumd57fd912000-03-10 22:53:23 +000010732 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010733 if (length == 1) {
10734 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10735 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
10736 (Py_UNICODE_ISUPPER(ch) != 0));
10737 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010738
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010739 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010740 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010741 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010742
Guido van Rossumd57fd912000-03-10 22:53:23 +000010743 cased = 0;
10744 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010745 for (i = 0; i < length; i++) {
10746 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010747
Benjamin Peterson29060642009-01-31 22:14:21 +000010748 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
10749 if (previous_is_cased)
10750 return PyBool_FromLong(0);
10751 previous_is_cased = 1;
10752 cased = 1;
10753 }
10754 else if (Py_UNICODE_ISLOWER(ch)) {
10755 if (!previous_is_cased)
10756 return PyBool_FromLong(0);
10757 previous_is_cased = 1;
10758 cased = 1;
10759 }
10760 else
10761 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010762 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010763 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010764}
10765
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010766PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010767 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010768\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010769Return True if all characters in S are whitespace\n\
10770and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010771
10772static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010773unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010774{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010775 Py_ssize_t i, length;
10776 int kind;
10777 void *data;
10778
10779 if (PyUnicode_READY(self) == -1)
10780 return NULL;
10781 length = PyUnicode_GET_LENGTH(self);
10782 kind = PyUnicode_KIND(self);
10783 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010784
Guido van Rossumd57fd912000-03-10 22:53:23 +000010785 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010786 if (length == 1)
10787 return PyBool_FromLong(
10788 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010789
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010790 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010791 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010792 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010793
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010794 for (i = 0; i < length; i++) {
10795 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010796 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010797 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010798 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010799 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010800}
10801
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010802PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010803 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010804\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010805Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010806and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010807
10808static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010809unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010810{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010811 Py_ssize_t i, length;
10812 int kind;
10813 void *data;
10814
10815 if (PyUnicode_READY(self) == -1)
10816 return NULL;
10817 length = PyUnicode_GET_LENGTH(self);
10818 kind = PyUnicode_KIND(self);
10819 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010820
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010821 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010822 if (length == 1)
10823 return PyBool_FromLong(
10824 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010825
10826 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010827 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010828 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010829
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010830 for (i = 0; i < length; i++) {
10831 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010832 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010833 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010834 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010835}
10836
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010837PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010838 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010839\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010840Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010841and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010842
10843static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010844unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010845{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010846 int kind;
10847 void *data;
10848 Py_ssize_t len, i;
10849
10850 if (PyUnicode_READY(self) == -1)
10851 return NULL;
10852
10853 kind = PyUnicode_KIND(self);
10854 data = PyUnicode_DATA(self);
10855 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010856
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010857 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010858 if (len == 1) {
10859 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10860 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10861 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010862
10863 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010864 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010865 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010866
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010867 for (i = 0; i < len; i++) {
10868 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010869 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010870 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010871 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010872 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010873}
10874
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010875PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010876 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010877\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010878Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010879False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010880
10881static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010882unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010883{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010884 Py_ssize_t i, length;
10885 int kind;
10886 void *data;
10887
10888 if (PyUnicode_READY(self) == -1)
10889 return NULL;
10890 length = PyUnicode_GET_LENGTH(self);
10891 kind = PyUnicode_KIND(self);
10892 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010893
Guido van Rossumd57fd912000-03-10 22:53:23 +000010894 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010895 if (length == 1)
10896 return PyBool_FromLong(
10897 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010898
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010899 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010900 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010901 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010902
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010903 for (i = 0; i < length; i++) {
10904 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010905 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010906 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010907 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010908}
10909
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010910PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010911 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010912\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010913Return True if all characters in S are digits\n\
10914and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010915
10916static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010917unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010918{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010919 Py_ssize_t i, length;
10920 int kind;
10921 void *data;
10922
10923 if (PyUnicode_READY(self) == -1)
10924 return NULL;
10925 length = PyUnicode_GET_LENGTH(self);
10926 kind = PyUnicode_KIND(self);
10927 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010928
Guido van Rossumd57fd912000-03-10 22:53:23 +000010929 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010930 if (length == 1) {
10931 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10932 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
10933 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010934
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010935 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010936 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010937 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010938
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010939 for (i = 0; i < length; i++) {
10940 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010941 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010942 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010943 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010944}
10945
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010946PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010947 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010948\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010949Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010950False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010951
10952static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010953unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010954{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010955 Py_ssize_t i, length;
10956 int kind;
10957 void *data;
10958
10959 if (PyUnicode_READY(self) == -1)
10960 return NULL;
10961 length = PyUnicode_GET_LENGTH(self);
10962 kind = PyUnicode_KIND(self);
10963 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010964
Guido van Rossumd57fd912000-03-10 22:53:23 +000010965 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010966 if (length == 1)
10967 return PyBool_FromLong(
10968 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010969
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010970 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010971 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010972 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010973
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010974 for (i = 0; i < length; i++) {
10975 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010976 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010977 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010978 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010979}
10980
Martin v. Löwis47383402007-08-15 07:32:56 +000010981int
10982PyUnicode_IsIdentifier(PyObject *self)
10983{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010984 int kind;
10985 void *data;
10986 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010987 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000010988
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010989 if (PyUnicode_READY(self) == -1) {
10990 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000010991 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010992 }
10993
10994 /* Special case for empty strings */
10995 if (PyUnicode_GET_LENGTH(self) == 0)
10996 return 0;
10997 kind = PyUnicode_KIND(self);
10998 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000010999
11000 /* PEP 3131 says that the first character must be in
11001 XID_Start and subsequent characters in XID_Continue,
11002 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011003 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011004 letters, digits, underscore). However, given the current
11005 definition of XID_Start and XID_Continue, it is sufficient
11006 to check just for these, except that _ must be allowed
11007 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011008 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011009 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011010 return 0;
11011
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011012 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011013 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011014 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011015 return 1;
11016}
11017
11018PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011019 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011020\n\
11021Return True if S is a valid identifier according\n\
11022to the language definition.");
11023
11024static PyObject*
11025unicode_isidentifier(PyObject *self)
11026{
11027 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11028}
11029
Georg Brandl559e5d72008-06-11 18:37:52 +000011030PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011031 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011032\n\
11033Return True if all characters in S are considered\n\
11034printable in repr() or S is empty, False otherwise.");
11035
11036static PyObject*
11037unicode_isprintable(PyObject *self)
11038{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011039 Py_ssize_t i, length;
11040 int kind;
11041 void *data;
11042
11043 if (PyUnicode_READY(self) == -1)
11044 return NULL;
11045 length = PyUnicode_GET_LENGTH(self);
11046 kind = PyUnicode_KIND(self);
11047 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011048
11049 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011050 if (length == 1)
11051 return PyBool_FromLong(
11052 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011053
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011054 for (i = 0; i < length; i++) {
11055 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011056 Py_RETURN_FALSE;
11057 }
11058 }
11059 Py_RETURN_TRUE;
11060}
11061
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011062PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011063 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011064\n\
11065Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011066iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011067
11068static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011069unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011070{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011071 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011072}
11073
Martin v. Löwis18e16552006-02-15 17:27:45 +000011074static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000011075unicode_length(PyUnicodeObject *self)
11076{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011077 if (PyUnicode_READY(self) == -1)
11078 return -1;
11079 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011080}
11081
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011082PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011083 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011084\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011085Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011086done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011087
11088static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011089unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011090{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011091 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011092 Py_UCS4 fillchar = ' ';
11093
11094 if (PyUnicode_READY(self) == -1)
11095 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011096
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011097 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011098 return NULL;
11099
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011100 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011101 Py_INCREF(self);
11102 return (PyObject*) self;
11103 }
11104
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011105 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011106}
11107
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011108PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011109 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011110\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011111Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011112
11113static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011114unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011115{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011116 return fixup(self, fixlower);
11117}
11118
/* Strip modes shared by the strip helpers; the values double as
   indices into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by strip mode */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for a strip mode: skips the leading "|O:" (3 characters)
   of the matching format string. */
#define STRIPNAME(i) (stripformat[i]+3)
11127
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011128/* externally visible for str.strip(unicode) */
11129PyObject *
11130_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
11131{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011132 void *data;
11133 int kind;
11134 Py_ssize_t i, j, len;
11135 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011136
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011137 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11138 return NULL;
11139
11140 kind = PyUnicode_KIND(self);
11141 data = PyUnicode_DATA(self);
11142 len = PyUnicode_GET_LENGTH(self);
11143 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11144 PyUnicode_DATA(sepobj),
11145 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011146
Benjamin Peterson14339b62009-01-31 16:36:08 +000011147 i = 0;
11148 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011149 while (i < len &&
11150 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011151 i++;
11152 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011153 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011154
Benjamin Peterson14339b62009-01-31 16:36:08 +000011155 j = len;
11156 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011157 do {
11158 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011159 } while (j >= i &&
11160 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011161 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011162 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011163
Victor Stinner12bab6d2011-10-01 01:53:49 +020011164 return PyUnicode_Substring((PyObject*)self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011165}
11166
11167PyObject*
11168PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11169{
11170 unsigned char *data;
11171 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011172 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011173
Victor Stinnerde636f32011-10-01 03:55:54 +020011174 if (PyUnicode_READY(self) == -1)
11175 return NULL;
11176
11177 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11178
Victor Stinner12bab6d2011-10-01 01:53:49 +020011179 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011180 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011181 if (PyUnicode_CheckExact(self)) {
11182 Py_INCREF(self);
11183 return self;
11184 }
11185 else
11186 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011187 }
11188
Victor Stinner12bab6d2011-10-01 01:53:49 +020011189 length = end - start;
11190 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011191 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011192
Victor Stinnerde636f32011-10-01 03:55:54 +020011193 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011194 PyErr_SetString(PyExc_IndexError, "string index out of range");
11195 return NULL;
11196 }
11197
Victor Stinnerb9275c12011-10-05 14:01:42 +020011198 if (PyUnicode_IS_ASCII(self)) {
11199 kind = PyUnicode_KIND(self);
11200 data = PyUnicode_1BYTE_DATA(self);
11201 return unicode_fromascii(data + start, length);
11202 }
11203 else {
11204 kind = PyUnicode_KIND(self);
11205 data = PyUnicode_1BYTE_DATA(self);
11206 return PyUnicode_FromKindAndData(kind,
11207 data + PyUnicode_KIND_SIZE(kind, start),
11208 length);
11209 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011210}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011211
11212static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011213do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011214{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011215 int kind;
11216 void *data;
11217 Py_ssize_t len, i, j;
11218
11219 if (PyUnicode_READY(self) == -1)
11220 return NULL;
11221
11222 kind = PyUnicode_KIND(self);
11223 data = PyUnicode_DATA(self);
11224 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011225
Benjamin Peterson14339b62009-01-31 16:36:08 +000011226 i = 0;
11227 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011228 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011229 i++;
11230 }
11231 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011232
Benjamin Peterson14339b62009-01-31 16:36:08 +000011233 j = len;
11234 if (striptype != LEFTSTRIP) {
11235 do {
11236 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011237 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011238 j++;
11239 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011240
Victor Stinner12bab6d2011-10-01 01:53:49 +020011241 return PyUnicode_Substring((PyObject*)self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011242}
11243
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011244
11245static PyObject *
11246do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
11247{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011248 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011249
Benjamin Peterson14339b62009-01-31 16:36:08 +000011250 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11251 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011252
Benjamin Peterson14339b62009-01-31 16:36:08 +000011253 if (sep != NULL && sep != Py_None) {
11254 if (PyUnicode_Check(sep))
11255 return _PyUnicode_XStrip(self, striptype, sep);
11256 else {
11257 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011258 "%s arg must be None or str",
11259 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011260 return NULL;
11261 }
11262 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011263
Benjamin Peterson14339b62009-01-31 16:36:08 +000011264 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011265}
11266
11267
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011268PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011269 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011270\n\
11271Return a copy of the string S with leading and trailing\n\
11272whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011273If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011274
11275static PyObject *
11276unicode_strip(PyUnicodeObject *self, PyObject *args)
11277{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011278 if (PyTuple_GET_SIZE(args) == 0)
11279 return do_strip(self, BOTHSTRIP); /* Common case */
11280 else
11281 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011282}
11283
11284
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011285PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011286 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011287\n\
11288Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011289If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011290
11291static PyObject *
11292unicode_lstrip(PyUnicodeObject *self, PyObject *args)
11293{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011294 if (PyTuple_GET_SIZE(args) == 0)
11295 return do_strip(self, LEFTSTRIP); /* Common case */
11296 else
11297 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011298}
11299
11300
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011301PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011302 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011303\n\
11304Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011305If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011306
11307static PyObject *
11308unicode_rstrip(PyUnicodeObject *self, PyObject *args)
11309{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011310 if (PyTuple_GET_SIZE(args) == 0)
11311 return do_strip(self, RIGHTSTRIP); /* Common case */
11312 else
11313 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011314}
11315
11316
Guido van Rossumd57fd912000-03-10 22:53:23 +000011317static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000011318unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011319{
11320 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011321 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011322
Georg Brandl222de0f2009-04-12 12:01:50 +000011323 if (len < 1) {
11324 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011325 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011326 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011327
Tim Peters7a29bd52001-09-12 03:03:31 +000011328 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011329 /* no repeat, return original string */
11330 Py_INCREF(str);
11331 return (PyObject*) str;
11332 }
Tim Peters8f422462000-09-09 06:13:41 +000011333
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011334 if (PyUnicode_READY(str) == -1)
11335 return NULL;
11336
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011337 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011338 PyErr_SetString(PyExc_OverflowError,
11339 "repeated string is too long");
11340 return NULL;
11341 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011342 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011343
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011344 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011345 if (!u)
11346 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011347 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011348
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011349 if (PyUnicode_GET_LENGTH(str) == 1) {
11350 const int kind = PyUnicode_KIND(str);
11351 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11352 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011353 if (kind == PyUnicode_1BYTE_KIND)
11354 memset(to, (unsigned char)fill_char, len);
11355 else {
11356 for (n = 0; n < len; ++n)
11357 PyUnicode_WRITE(kind, to, n, fill_char);
11358 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011359 }
11360 else {
11361 /* number of characters copied this far */
11362 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
11363 const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str);
11364 char *to = (char *) PyUnicode_DATA(u);
11365 Py_MEMCPY(to, PyUnicode_DATA(str),
11366 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011367 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011368 n = (done <= nchars-done) ? done : nchars-done;
11369 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011370 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011371 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011372 }
11373
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011374 assert(_PyUnicode_CheckConsistency(u, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011375 return (PyObject*) u;
11376}
11377
Alexander Belopolsky40018472011-02-26 01:02:56 +000011378PyObject *
11379PyUnicode_Replace(PyObject *obj,
11380 PyObject *subobj,
11381 PyObject *replobj,
11382 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011383{
11384 PyObject *self;
11385 PyObject *str1;
11386 PyObject *str2;
11387 PyObject *result;
11388
11389 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011390 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011391 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011392 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011393 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011394 Py_DECREF(self);
11395 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011396 }
11397 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011398 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011399 Py_DECREF(self);
11400 Py_DECREF(str1);
11401 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011402 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011403 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011404 Py_DECREF(self);
11405 Py_DECREF(str1);
11406 Py_DECREF(str2);
11407 return result;
11408}
11409
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011410PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011411 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011412\n\
11413Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011414old replaced by new. If the optional argument count is\n\
11415given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011416
11417static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011418unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011419{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011420 PyObject *str1;
11421 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011422 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011423 PyObject *result;
11424
Martin v. Löwis18e16552006-02-15 17:27:45 +000011425 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011426 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011427 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011428 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011429 str1 = PyUnicode_FromObject(str1);
11430 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11431 return NULL;
11432 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011433 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011434 Py_DECREF(str1);
11435 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011436 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011437
11438 result = replace(self, str1, str2, maxcount);
11439
11440 Py_DECREF(str1);
11441 Py_DECREF(str2);
11442 return result;
11443}
11444
Alexander Belopolsky40018472011-02-26 01:02:56 +000011445static PyObject *
11446unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011447{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011448 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011449 Py_ssize_t isize;
11450 Py_ssize_t osize, squote, dquote, i, o;
11451 Py_UCS4 max, quote;
11452 int ikind, okind;
11453 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011454
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011455 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011456 return NULL;
11457
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011458 isize = PyUnicode_GET_LENGTH(unicode);
11459 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011460
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011461 /* Compute length of output, quote characters, and
11462 maximum character */
11463 osize = 2; /* quotes */
11464 max = 127;
11465 squote = dquote = 0;
11466 ikind = PyUnicode_KIND(unicode);
11467 for (i = 0; i < isize; i++) {
11468 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11469 switch (ch) {
11470 case '\'': squote++; osize++; break;
11471 case '"': dquote++; osize++; break;
11472 case '\\': case '\t': case '\r': case '\n':
11473 osize += 2; break;
11474 default:
11475 /* Fast-path ASCII */
11476 if (ch < ' ' || ch == 0x7f)
11477 osize += 4; /* \xHH */
11478 else if (ch < 0x7f)
11479 osize++;
11480 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11481 osize++;
11482 max = ch > max ? ch : max;
11483 }
11484 else if (ch < 0x100)
11485 osize += 4; /* \xHH */
11486 else if (ch < 0x10000)
11487 osize += 6; /* \uHHHH */
11488 else
11489 osize += 10; /* \uHHHHHHHH */
11490 }
11491 }
11492
11493 quote = '\'';
11494 if (squote) {
11495 if (dquote)
11496 /* Both squote and dquote present. Use squote,
11497 and escape them */
11498 osize += squote;
11499 else
11500 quote = '"';
11501 }
11502
11503 repr = PyUnicode_New(osize, max);
11504 if (repr == NULL)
11505 return NULL;
11506 okind = PyUnicode_KIND(repr);
11507 odata = PyUnicode_DATA(repr);
11508
11509 PyUnicode_WRITE(okind, odata, 0, quote);
11510 PyUnicode_WRITE(okind, odata, osize-1, quote);
11511
11512 for (i = 0, o = 1; i < isize; i++) {
11513 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011514
11515 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011516 if ((ch == quote) || (ch == '\\')) {
11517 PyUnicode_WRITE(okind, odata, o++, '\\');
11518 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011519 continue;
11520 }
11521
Benjamin Peterson29060642009-01-31 22:14:21 +000011522 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011523 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011524 PyUnicode_WRITE(okind, odata, o++, '\\');
11525 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011526 }
11527 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011528 PyUnicode_WRITE(okind, odata, o++, '\\');
11529 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011530 }
11531 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011532 PyUnicode_WRITE(okind, odata, o++, '\\');
11533 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011534 }
11535
11536 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011537 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011538 PyUnicode_WRITE(okind, odata, o++, '\\');
11539 PyUnicode_WRITE(okind, odata, o++, 'x');
11540 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11541 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011542 }
11543
Georg Brandl559e5d72008-06-11 18:37:52 +000011544 /* Copy ASCII characters as-is */
11545 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011546 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011547 }
11548
Benjamin Peterson29060642009-01-31 22:14:21 +000011549 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000011550 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011551 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000011552 (categories Z* and C* except ASCII space)
11553 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011554 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011555 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011556 if (ch <= 0xff) {
11557 PyUnicode_WRITE(okind, odata, o++, '\\');
11558 PyUnicode_WRITE(okind, odata, o++, 'x');
11559 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11560 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011561 }
11562 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011563 else if (ch >= 0x10000) {
11564 PyUnicode_WRITE(okind, odata, o++, '\\');
11565 PyUnicode_WRITE(okind, odata, o++, 'U');
11566 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
11567 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
11568 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
11569 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
11570 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11571 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11572 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11573 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011574 }
11575 /* Map 16-bit characters to '\uxxxx' */
11576 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011577 PyUnicode_WRITE(okind, odata, o++, '\\');
11578 PyUnicode_WRITE(okind, odata, o++, 'u');
11579 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11580 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11581 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11582 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011583 }
11584 }
11585 /* Copy characters as-is */
11586 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011587 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011588 }
11589 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000011590 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011591 /* Closing quote already added at the beginning */
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011592 assert(_PyUnicode_CheckConsistency(unicode, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000011593 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011594}
11595
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011596PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011597 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011598\n\
11599Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011600such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011601arguments start and end are interpreted as in slice notation.\n\
11602\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011603Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011604
11605static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011606unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011607{
Jesus Ceaac451502011-04-20 17:09:23 +020011608 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011609 Py_ssize_t start;
11610 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011611 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011612
Jesus Ceaac451502011-04-20 17:09:23 +020011613 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
11614 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011615 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011616
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011617 if (PyUnicode_READY(self) == -1)
11618 return NULL;
11619 if (PyUnicode_READY(substring) == -1)
11620 return NULL;
11621
11622 result = any_find_slice(
Victor Stinnerc3cec782011-10-05 21:24:08 +020011623 asciilib_rfind_slice, ucs1lib_rfind_slice,
11624 ucs2lib_rfind_slice, ucs4lib_rfind_slice,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011625 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011626 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011627
11628 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011629
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011630 if (result == -2)
11631 return NULL;
11632
Christian Heimes217cfd12007-12-02 14:31:20 +000011633 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011634}
11635
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011636PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011637 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011638\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011639Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011640
11641static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011642unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011643{
Jesus Ceaac451502011-04-20 17:09:23 +020011644 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011645 Py_ssize_t start;
11646 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011647 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011648
Jesus Ceaac451502011-04-20 17:09:23 +020011649 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
11650 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011651 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011652
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011653 if (PyUnicode_READY(self) == -1)
11654 return NULL;
11655 if (PyUnicode_READY(substring) == -1)
11656 return NULL;
11657
11658 result = any_find_slice(
Victor Stinnerc3cec782011-10-05 21:24:08 +020011659 asciilib_rfind_slice, ucs1lib_rfind_slice,
11660 ucs2lib_rfind_slice, ucs4lib_rfind_slice,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011661 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011662 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011663
11664 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011665
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011666 if (result == -2)
11667 return NULL;
11668
Guido van Rossumd57fd912000-03-10 22:53:23 +000011669 if (result < 0) {
11670 PyErr_SetString(PyExc_ValueError, "substring not found");
11671 return NULL;
11672 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011673
Christian Heimes217cfd12007-12-02 14:31:20 +000011674 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011675}
11676
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011677PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011678 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011679\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011680Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011681done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011682
11683static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011684unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011685{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011686 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011687 Py_UCS4 fillchar = ' ';
11688
Victor Stinnere9a29352011-10-01 02:14:59 +020011689 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011690 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011691
Victor Stinnere9a29352011-10-01 02:14:59 +020011692 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011693 return NULL;
11694
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011695 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011696 Py_INCREF(self);
11697 return (PyObject*) self;
11698 }
11699
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011700 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011701}
11702
Alexander Belopolsky40018472011-02-26 01:02:56 +000011703PyObject *
11704PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011705{
11706 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000011707
Guido van Rossumd57fd912000-03-10 22:53:23 +000011708 s = PyUnicode_FromObject(s);
11709 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011710 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011711 if (sep != NULL) {
11712 sep = PyUnicode_FromObject(sep);
11713 if (sep == NULL) {
11714 Py_DECREF(s);
11715 return NULL;
11716 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011717 }
11718
Victor Stinner9310abb2011-10-05 00:59:23 +020011719 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011720
11721 Py_DECREF(s);
11722 Py_XDECREF(sep);
11723 return result;
11724}
11725
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011726PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011727 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011728\n\
11729Return a list of the words in S, using sep as the\n\
11730delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000011731splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000011732whitespace string is a separator and empty strings are\n\
11733removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011734
11735static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011736unicode_split(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011737{
11738 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011739 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011740
Martin v. Löwis18e16552006-02-15 17:27:45 +000011741 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011742 return NULL;
11743
11744 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011745 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011746 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020011747 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011748 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011749 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011750}
11751
Thomas Wouters477c8d52006-05-27 19:21:47 +000011752PyObject *
11753PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11754{
11755 PyObject* str_obj;
11756 PyObject* sep_obj;
11757 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011758 int kind1, kind2, kind;
11759 void *buf1 = NULL, *buf2 = NULL;
11760 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011761
11762 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020011763 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011764 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011765 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011766 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011767 Py_DECREF(str_obj);
11768 return NULL;
11769 }
11770
Victor Stinner14f8f022011-10-05 20:58:25 +020011771 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011772 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020011773 kind = Py_MAX(kind1, kind2);
11774 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011775 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020011776 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011777 if (!buf1)
11778 goto onError;
11779 buf2 = PyUnicode_DATA(sep_obj);
11780 if (kind2 != kind)
11781 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11782 if (!buf2)
11783 goto onError;
11784 len1 = PyUnicode_GET_LENGTH(str_obj);
11785 len2 = PyUnicode_GET_LENGTH(sep_obj);
11786
Victor Stinner14f8f022011-10-05 20:58:25 +020011787 switch(PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011788 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020011789 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
11790 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11791 else
11792 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011793 break;
11794 case PyUnicode_2BYTE_KIND:
11795 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11796 break;
11797 case PyUnicode_4BYTE_KIND:
11798 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11799 break;
11800 default:
11801 assert(0);
11802 out = 0;
11803 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011804
11805 Py_DECREF(sep_obj);
11806 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011807 if (kind1 != kind)
11808 PyMem_Free(buf1);
11809 if (kind2 != kind)
11810 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011811
11812 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011813 onError:
11814 Py_DECREF(sep_obj);
11815 Py_DECREF(str_obj);
11816 if (kind1 != kind && buf1)
11817 PyMem_Free(buf1);
11818 if (kind2 != kind && buf2)
11819 PyMem_Free(buf2);
11820 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011821}
11822
11823
11824PyObject *
11825PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11826{
11827 PyObject* str_obj;
11828 PyObject* sep_obj;
11829 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011830 int kind1, kind2, kind;
11831 void *buf1 = NULL, *buf2 = NULL;
11832 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011833
11834 str_obj = PyUnicode_FromObject(str_in);
11835 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000011836 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011837 sep_obj = PyUnicode_FromObject(sep_in);
11838 if (!sep_obj) {
11839 Py_DECREF(str_obj);
11840 return NULL;
11841 }
11842
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011843 kind1 = PyUnicode_KIND(str_in);
11844 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020011845 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011846 buf1 = PyUnicode_DATA(str_in);
11847 if (kind1 != kind)
11848 buf1 = _PyUnicode_AsKind(str_in, kind);
11849 if (!buf1)
11850 goto onError;
11851 buf2 = PyUnicode_DATA(sep_obj);
11852 if (kind2 != kind)
11853 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11854 if (!buf2)
11855 goto onError;
11856 len1 = PyUnicode_GET_LENGTH(str_obj);
11857 len2 = PyUnicode_GET_LENGTH(sep_obj);
11858
11859 switch(PyUnicode_KIND(str_in)) {
11860 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020011861 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
11862 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11863 else
11864 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011865 break;
11866 case PyUnicode_2BYTE_KIND:
11867 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11868 break;
11869 case PyUnicode_4BYTE_KIND:
11870 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11871 break;
11872 default:
11873 assert(0);
11874 out = 0;
11875 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011876
11877 Py_DECREF(sep_obj);
11878 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011879 if (kind1 != kind)
11880 PyMem_Free(buf1);
11881 if (kind2 != kind)
11882 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011883
11884 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011885 onError:
11886 Py_DECREF(sep_obj);
11887 Py_DECREF(str_obj);
11888 if (kind1 != kind && buf1)
11889 PyMem_Free(buf1);
11890 if (kind2 != kind && buf2)
11891 PyMem_Free(buf2);
11892 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011893}
11894
11895PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011896 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011897\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011898Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011899the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011900found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011901
11902static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011903unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011904{
Victor Stinner9310abb2011-10-05 00:59:23 +020011905 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011906}
11907
11908PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000011909 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011910\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011911Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011912the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011913separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011914
11915static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011916unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011917{
Victor Stinner9310abb2011-10-05 00:59:23 +020011918 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011919}
11920
Alexander Belopolsky40018472011-02-26 01:02:56 +000011921PyObject *
11922PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011923{
11924 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011925
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011926 s = PyUnicode_FromObject(s);
11927 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011928 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011929 if (sep != NULL) {
11930 sep = PyUnicode_FromObject(sep);
11931 if (sep == NULL) {
11932 Py_DECREF(s);
11933 return NULL;
11934 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011935 }
11936
Victor Stinner9310abb2011-10-05 00:59:23 +020011937 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011938
11939 Py_DECREF(s);
11940 Py_XDECREF(sep);
11941 return result;
11942}
11943
11944PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011945 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011946\n\
11947Return a list of the words in S, using sep as the\n\
11948delimiter string, starting at the end of the string and\n\
11949working to the front. If maxsplit is given, at most maxsplit\n\
11950splits are done. If sep is not specified, any whitespace string\n\
11951is a separator.");
11952
11953static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011954unicode_rsplit(PyObject *self, PyObject *args)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011955{
11956 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011957 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011958
Martin v. Löwis18e16552006-02-15 17:27:45 +000011959 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011960 return NULL;
11961
11962 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011963 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011964 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020011965 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011966 else
Victor Stinner9310abb2011-10-05 00:59:23 +020011967 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011968}
11969
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011970PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011971 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011972\n\
11973Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000011974Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011975is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011976
11977static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011978unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011979{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011980 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000011981 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011982
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011983 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
11984 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011985 return NULL;
11986
Guido van Rossum86662912000-04-11 15:38:46 +000011987 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011988}
11989
11990static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000011991PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011992{
Walter Dörwald346737f2007-05-31 10:44:43 +000011993 if (PyUnicode_CheckExact(self)) {
11994 Py_INCREF(self);
11995 return self;
11996 } else
11997 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020011998 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011999}
12000
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012001PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012002 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012003\n\
12004Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012005and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012006
12007static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012008unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012009{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012010 return fixup(self, fixswapcase);
12011}
12012
Georg Brandlceee0772007-11-27 23:48:05 +000012013PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012014 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012015\n\
12016Return a translation table usable for str.translate().\n\
12017If there is only one argument, it must be a dictionary mapping Unicode\n\
12018ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012019Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012020If there are two arguments, they must be strings of equal length, and\n\
12021in the resulting dictionary, each character in x will be mapped to the\n\
12022character at the same position in y. If there is a third argument, it\n\
12023must be a string, whose characters will be mapped to None in the result.");
12024
12025static PyObject*
12026unicode_maketrans(PyUnicodeObject *null, PyObject *args)
12027{
12028 PyObject *x, *y = NULL, *z = NULL;
12029 PyObject *new = NULL, *key, *value;
12030 Py_ssize_t i = 0;
12031 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012032
Georg Brandlceee0772007-11-27 23:48:05 +000012033 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12034 return NULL;
12035 new = PyDict_New();
12036 if (!new)
12037 return NULL;
12038 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012039 int x_kind, y_kind, z_kind;
12040 void *x_data, *y_data, *z_data;
12041
Georg Brandlceee0772007-11-27 23:48:05 +000012042 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012043 if (!PyUnicode_Check(x)) {
12044 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12045 "be a string if there is a second argument");
12046 goto err;
12047 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012048 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012049 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12050 "arguments must have equal length");
12051 goto err;
12052 }
12053 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012054 x_kind = PyUnicode_KIND(x);
12055 y_kind = PyUnicode_KIND(y);
12056 x_data = PyUnicode_DATA(x);
12057 y_data = PyUnicode_DATA(y);
12058 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12059 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
12060 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012061 if (!key || !value)
12062 goto err;
12063 res = PyDict_SetItem(new, key, value);
12064 Py_DECREF(key);
12065 Py_DECREF(value);
12066 if (res < 0)
12067 goto err;
12068 }
12069 /* create entries for deleting chars in z */
12070 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012071 z_kind = PyUnicode_KIND(z);
12072 z_data = PyUnicode_DATA(z);
Georg Brandlceee0772007-11-27 23:48:05 +000012073 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012074 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012075 if (!key)
12076 goto err;
12077 res = PyDict_SetItem(new, key, Py_None);
12078 Py_DECREF(key);
12079 if (res < 0)
12080 goto err;
12081 }
12082 }
12083 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012084 int kind;
12085 void *data;
12086
Georg Brandlceee0772007-11-27 23:48:05 +000012087 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012088 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012089 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12090 "to maketrans it must be a dict");
12091 goto err;
12092 }
12093 /* copy entries into the new dict, converting string keys to int keys */
12094 while (PyDict_Next(x, &i, &key, &value)) {
12095 if (PyUnicode_Check(key)) {
12096 /* convert string keys to integer keys */
12097 PyObject *newkey;
12098 if (PyUnicode_GET_SIZE(key) != 1) {
12099 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12100 "table must be of length 1");
12101 goto err;
12102 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012103 kind = PyUnicode_KIND(key);
12104 data = PyUnicode_DATA(key);
12105 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012106 if (!newkey)
12107 goto err;
12108 res = PyDict_SetItem(new, newkey, value);
12109 Py_DECREF(newkey);
12110 if (res < 0)
12111 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012112 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012113 /* just keep integer keys */
12114 if (PyDict_SetItem(new, key, value) < 0)
12115 goto err;
12116 } else {
12117 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12118 "be strings or integers");
12119 goto err;
12120 }
12121 }
12122 }
12123 return new;
12124 err:
12125 Py_DECREF(new);
12126 return NULL;
12127}
12128
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012129PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012130 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012131\n\
12132Return a copy of the string S, where all characters have been mapped\n\
12133through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012134Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012135Unmapped characters are left untouched. Characters mapped to None\n\
12136are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012137
12138static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012139unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012140{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012141 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012142}
12143
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012144PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012145 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012146\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012147Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012148
12149static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012150unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012151{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012152 return fixup(self, fixupper);
12153}
12154
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012155PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012156 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012157\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012158Pad a numeric string S with zeros on the left, to fill a field\n\
12159of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012160
12161static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012162unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012163{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012164 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012165 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012166 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012167 int kind;
12168 void *data;
12169 Py_UCS4 chr;
12170
12171 if (PyUnicode_READY(self) == -1)
12172 return NULL;
12173
Martin v. Löwis18e16552006-02-15 17:27:45 +000012174 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012175 return NULL;
12176
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012177 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000012178 if (PyUnicode_CheckExact(self)) {
12179 Py_INCREF(self);
12180 return (PyObject*) self;
12181 }
12182 else
Victor Stinner2219e0a2011-10-01 01:16:59 +020012183 return PyUnicode_Copy((PyObject*)self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012184 }
12185
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012186 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012187
12188 u = pad(self, fill, 0, '0');
12189
Walter Dörwald068325e2002-04-15 13:36:47 +000012190 if (u == NULL)
12191 return NULL;
12192
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012193 kind = PyUnicode_KIND(u);
12194 data = PyUnicode_DATA(u);
12195 chr = PyUnicode_READ(kind, data, fill);
12196
12197 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012198 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012199 PyUnicode_WRITE(kind, data, 0, chr);
12200 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012201 }
12202
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012203 assert(_PyUnicode_CheckConsistency(u, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012204 return (PyObject*) u;
12205}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012206
#if 0
/* Compiled out.  Thin wrapper that returns the result of
   PyUnicode_TransformDecimalAndSpaceToASCII(self). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *ascii;

    ascii = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return ascii;
}
#endif
12214
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012215PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012216 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012217\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012218Return True if S starts with the specified prefix, False otherwise.\n\
12219With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012220With optional end, stop comparing S at that position.\n\
12221prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012222
12223static PyObject *
12224unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012225 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012226{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012227 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012228 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012229 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012230 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012231 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012232
Jesus Ceaac451502011-04-20 17:09:23 +020012233 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012234 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012235 if (PyTuple_Check(subobj)) {
12236 Py_ssize_t i;
12237 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12238 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012239 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012240 if (substring == NULL)
12241 return NULL;
12242 result = tailmatch(self, substring, start, end, -1);
12243 Py_DECREF(substring);
12244 if (result) {
12245 Py_RETURN_TRUE;
12246 }
12247 }
12248 /* nothing matched */
12249 Py_RETURN_FALSE;
12250 }
12251 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012252 if (substring == NULL) {
12253 if (PyErr_ExceptionMatches(PyExc_TypeError))
12254 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12255 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012256 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012257 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012258 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012259 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012260 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012261}
12262
12263
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012264PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012265 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012266\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012267Return True if S ends with the specified suffix, False otherwise.\n\
12268With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012269With optional end, stop comparing S at that position.\n\
12270suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012271
12272static PyObject *
12273unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012274 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012275{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012276 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012277 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012278 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012279 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012280 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012281
Jesus Ceaac451502011-04-20 17:09:23 +020012282 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012283 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012284 if (PyTuple_Check(subobj)) {
12285 Py_ssize_t i;
12286 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12287 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012288 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012289 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012290 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012291 result = tailmatch(self, substring, start, end, +1);
12292 Py_DECREF(substring);
12293 if (result) {
12294 Py_RETURN_TRUE;
12295 }
12296 }
12297 Py_RETURN_FALSE;
12298 }
12299 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012300 if (substring == NULL) {
12301 if (PyErr_ExceptionMatches(PyExc_TypeError))
12302 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12303 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012304 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012305 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012306 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012307 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012308 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012309}
12310
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012311#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012312
12313PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012314 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012315\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012316Return a formatted version of S, using substitutions from args and kwargs.\n\
12317The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012318
Eric Smith27bbca62010-11-04 17:06:58 +000012319PyDoc_STRVAR(format_map__doc__,
12320 "S.format_map(mapping) -> str\n\
12321\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012322Return a formatted version of S, using substitutions from mapping.\n\
12323The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012324
Eric Smith4a7d76d2008-05-30 18:10:19 +000012325static PyObject *
12326unicode__format__(PyObject* self, PyObject* args)
12327{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012328 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012329
12330 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12331 return NULL;
12332
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012333 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012334 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012335 if (out != NULL)
12336 assert(_PyUnicode_CheckConsistency(out, 1));
12337 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012338}
12339
Eric Smith8c663262007-08-25 02:26:07 +000012340PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012341 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012342\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012343Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012344
12345static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012346unicode__sizeof__(PyUnicodeObject *v)
12347{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012348 Py_ssize_t size;
12349
12350 /* If it's a compact object, account for base structure +
12351 character data. */
12352 if (PyUnicode_IS_COMPACT_ASCII(v))
12353 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12354 else if (PyUnicode_IS_COMPACT(v))
12355 size = sizeof(PyCompactUnicodeObject) +
12356 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v);
12357 else {
12358 /* If it is a two-block object, account for base object, and
12359 for character block if present. */
12360 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012361 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012362 size += (PyUnicode_GET_LENGTH(v) + 1) *
12363 PyUnicode_CHARACTER_SIZE(v);
12364 }
12365 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012366 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012367 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012368 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012369 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012370 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012371
12372 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012373}
12374
12375PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012376 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012377
12378static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012379unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012380{
Victor Stinner034f6cf2011-09-30 02:26:44 +020012381 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012382 if (!copy)
12383 return NULL;
12384 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012385}
12386
Guido van Rossumd57fd912000-03-10 22:53:23 +000012387static PyMethodDef unicode_methods[] = {
12388
12389 /* Order is according to common usage: often used methods should
12390 appear first, since lookup is done sequentially. */
12391
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012392 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012393 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12394 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012395 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012396 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12397 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12398 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12399 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12400 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12401 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12402 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012403 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012404 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12405 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12406 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012407 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012408 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12409 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12410 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012411 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012412 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012413 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012414 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012415 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12416 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12417 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12418 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12419 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12420 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12421 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12422 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12423 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12424 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12425 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12426 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12427 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12428 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012429 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012430 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012431 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012432 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012433 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012434 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012435 {"maketrans", (PyCFunction) unicode_maketrans,
12436 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012437 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012438#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012439 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012440#endif
12441
12442#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012443 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012444 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012445#endif
12446
Benjamin Peterson14339b62009-01-31 16:36:08 +000012447 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012448 {NULL, NULL}
12449};
12450
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012451static PyObject *
12452unicode_mod(PyObject *v, PyObject *w)
12453{
Brian Curtindfc80e32011-08-10 20:28:54 -050012454 if (!PyUnicode_Check(v))
12455 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012456 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012457}
12458
12459static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012460 0, /*nb_add*/
12461 0, /*nb_subtract*/
12462 0, /*nb_multiply*/
12463 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012464};
12465
Guido van Rossumd57fd912000-03-10 22:53:23 +000012466static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012467 (lenfunc) unicode_length, /* sq_length */
12468 PyUnicode_Concat, /* sq_concat */
12469 (ssizeargfunc) unicode_repeat, /* sq_repeat */
12470 (ssizeargfunc) unicode_getitem, /* sq_item */
12471 0, /* sq_slice */
12472 0, /* sq_ass_item */
12473 0, /* sq_ass_slice */
12474 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012475};
12476
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012477static PyObject*
12478unicode_subscript(PyUnicodeObject* self, PyObject* item)
12479{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012480 if (PyUnicode_READY(self) == -1)
12481 return NULL;
12482
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000012483 if (PyIndex_Check(item)) {
12484 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012485 if (i == -1 && PyErr_Occurred())
12486 return NULL;
12487 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012488 i += PyUnicode_GET_LENGTH(self);
Victor Stinner2fe5ced2011-10-02 00:25:40 +020012489 return unicode_getitem((PyObject*)self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012490 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000012491 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012492 PyObject *result;
12493 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012494 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012495 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012496
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012497 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000012498 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012499 return NULL;
12500 }
12501
12502 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012503 return PyUnicode_New(0, 0);
12504 } else if (start == 0 && step == 1 &&
12505 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000012506 PyUnicode_CheckExact(self)) {
12507 Py_INCREF(self);
12508 return (PyObject *)self;
12509 } else if (step == 1) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012510 return PyUnicode_Substring((PyObject*)self,
12511 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012512 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012513 /* General case */
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012514 max_char = 0;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012515 src_kind = PyUnicode_KIND(self);
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012516 kind_limit = kind_maxchar_limit(src_kind);
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012517 src_data = PyUnicode_DATA(self);
12518 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
12519 ch = PyUnicode_READ(src_kind, src_data, cur);
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012520 if (ch > max_char) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012521 max_char = ch;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012522 if (max_char >= kind_limit)
12523 break;
12524 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012525 }
12526 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012527 if (result == NULL)
12528 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012529 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012530 dest_data = PyUnicode_DATA(result);
12531
12532 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012533 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
12534 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012535 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012536 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012537 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012538 } else {
12539 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
12540 return NULL;
12541 }
12542}
12543
12544static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012545 (lenfunc)unicode_length, /* mp_length */
12546 (binaryfunc)unicode_subscript, /* mp_subscript */
12547 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012548};
12549
Guido van Rossumd57fd912000-03-10 22:53:23 +000012550
Guido van Rossumd57fd912000-03-10 22:53:23 +000012551/* Helpers for PyUnicode_Format() */
12552
12553static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000012554getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012555{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012556 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012557 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012558 (*p_argidx)++;
12559 if (arglen < 0)
12560 return args;
12561 else
12562 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012563 }
12564 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012565 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012566 return NULL;
12567}
12568
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012569/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012570
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012571static PyObject *
12572formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012573{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012574 char *p;
12575 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012576 double x;
Tim Petersced69f82003-09-16 20:30:58 +000012577
Guido van Rossumd57fd912000-03-10 22:53:23 +000012578 x = PyFloat_AsDouble(v);
12579 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012580 return NULL;
12581
Guido van Rossumd57fd912000-03-10 22:53:23 +000012582 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012583 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000012584
Eric Smith0923d1d2009-04-16 20:16:10 +000012585 p = PyOS_double_to_string(x, type, prec,
12586 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012587 if (p == NULL)
12588 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012589 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000012590 PyMem_Free(p);
12591 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012592}
12593
Tim Peters38fd5b62000-09-21 05:43:11 +000012594static PyObject*
12595formatlong(PyObject *val, int flags, int prec, int type)
12596{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012597 char *buf;
12598 int len;
12599 PyObject *str; /* temporary string object. */
12600 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012601
Benjamin Peterson14339b62009-01-31 16:36:08 +000012602 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
12603 if (!str)
12604 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012605 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012606 Py_DECREF(str);
12607 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012608}
12609
Guido van Rossumd57fd912000-03-10 22:53:23 +000012610static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012611formatchar(Py_UCS4 *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012612 size_t buflen,
12613 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012614{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012615 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012616 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012617 if (PyUnicode_GET_LENGTH(v) == 1) {
12618 buf[0] = PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000012619 buf[1] = '\0';
12620 return 1;
12621 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012622 goto onError;
12623 }
12624 else {
12625 /* Integer input truncated to a character */
12626 long x;
12627 x = PyLong_AsLong(v);
12628 if (x == -1 && PyErr_Occurred())
12629 goto onError;
12630
12631 if (x < 0 || x > 0x10ffff) {
12632 PyErr_SetString(PyExc_OverflowError,
12633 "%c arg not in range(0x110000)");
12634 return -1;
12635 }
12636
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012637 buf[0] = (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012638 buf[1] = '\0';
12639 return 1;
12640 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012641
Benjamin Peterson29060642009-01-31 22:14:21 +000012642 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012643 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012644 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012645 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012646}
12647
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
   FORMATBUFLEN is the length of the buffer in which chars are formatted.

   The replacement list is parenthesized as a whole so the macro behaves
   as a single operand wherever it is expanded (for instance directly
   after the sizeof operator).
*/
#define FORMATBUFLEN ((size_t)10)
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012652
Alexander Belopolsky40018472011-02-26 01:02:56 +000012653PyObject *
12654PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012655{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012656 void *fmt;
12657 int fmtkind;
12658 PyObject *result;
12659 Py_UCS4 *res, *res0;
12660 Py_UCS4 max;
12661 int kind;
12662 Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012663 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012664 PyObject *dict = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012665 PyUnicodeObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +000012666
Guido van Rossumd57fd912000-03-10 22:53:23 +000012667 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012668 PyErr_BadInternalCall();
12669 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012670 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012671 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
12672 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012673 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012674 fmt = PyUnicode_DATA(uformat);
12675 fmtkind = PyUnicode_KIND(uformat);
12676 fmtcnt = PyUnicode_GET_LENGTH(uformat);
12677 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012678
12679 reslen = rescnt = fmtcnt + 100;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012680 res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4));
12681 if (res0 == NULL) {
12682 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012683 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012684 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012685
12686 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012687 arglen = PyTuple_Size(args);
12688 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012689 }
12690 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012691 arglen = -1;
12692 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012693 }
Christian Heimes90aa7642007-12-19 02:45:37 +000012694 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000012695 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000012696 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012697
12698 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012699 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012700 if (--rescnt < 0) {
12701 rescnt = fmtcnt + 100;
12702 reslen += rescnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012703 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12704 if (res0 == NULL){
12705 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012706 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012707 }
12708 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012709 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012710 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012711 *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012712 }
12713 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012714 /* Got a format specifier */
12715 int flags = 0;
12716 Py_ssize_t width = -1;
12717 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012718 Py_UCS4 c = '\0';
12719 Py_UCS4 fill;
Benjamin Peterson29060642009-01-31 22:14:21 +000012720 int isnumok;
12721 PyObject *v = NULL;
12722 PyObject *temp = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012723 void *pbuf;
12724 Py_ssize_t pindex;
Benjamin Peterson29060642009-01-31 22:14:21 +000012725 Py_UNICODE sign;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012726 Py_ssize_t len, len1;
12727 Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012728
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012729 fmtpos++;
12730 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
12731 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000012732 Py_ssize_t keylen;
12733 PyObject *key;
12734 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000012735
Benjamin Peterson29060642009-01-31 22:14:21 +000012736 if (dict == NULL) {
12737 PyErr_SetString(PyExc_TypeError,
12738 "format requires a mapping");
12739 goto onError;
12740 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012741 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012742 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012743 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012744 /* Skip over balanced parentheses */
12745 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012746 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000012747 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012748 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000012749 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012750 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000012751 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012752 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012753 if (fmtcnt < 0 || pcount > 0) {
12754 PyErr_SetString(PyExc_ValueError,
12755 "incomplete format key");
12756 goto onError;
12757 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020012758 key = PyUnicode_Substring((PyObject*)uformat,
12759 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000012760 if (key == NULL)
12761 goto onError;
12762 if (args_owned) {
12763 Py_DECREF(args);
12764 args_owned = 0;
12765 }
12766 args = PyObject_GetItem(dict, key);
12767 Py_DECREF(key);
12768 if (args == NULL) {
12769 goto onError;
12770 }
12771 args_owned = 1;
12772 arglen = -1;
12773 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012774 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012775 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012776 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012777 case '-': flags |= F_LJUST; continue;
12778 case '+': flags |= F_SIGN; continue;
12779 case ' ': flags |= F_BLANK; continue;
12780 case '#': flags |= F_ALT; continue;
12781 case '0': flags |= F_ZERO; continue;
12782 }
12783 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012784 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012785 if (c == '*') {
12786 v = getnextarg(args, arglen, &argidx);
12787 if (v == NULL)
12788 goto onError;
12789 if (!PyLong_Check(v)) {
12790 PyErr_SetString(PyExc_TypeError,
12791 "* wants int");
12792 goto onError;
12793 }
12794 width = PyLong_AsLong(v);
12795 if (width == -1 && PyErr_Occurred())
12796 goto onError;
12797 if (width < 0) {
12798 flags |= F_LJUST;
12799 width = -width;
12800 }
12801 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012802 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012803 }
12804 else if (c >= '0' && c <= '9') {
12805 width = c - '0';
12806 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012807 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012808 if (c < '0' || c > '9')
12809 break;
12810 if ((width*10) / 10 != width) {
12811 PyErr_SetString(PyExc_ValueError,
12812 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000012813 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000012814 }
12815 width = width*10 + (c - '0');
12816 }
12817 }
12818 if (c == '.') {
12819 prec = 0;
12820 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012821 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012822 if (c == '*') {
12823 v = getnextarg(args, arglen, &argidx);
12824 if (v == NULL)
12825 goto onError;
12826 if (!PyLong_Check(v)) {
12827 PyErr_SetString(PyExc_TypeError,
12828 "* wants int");
12829 goto onError;
12830 }
12831 prec = PyLong_AsLong(v);
12832 if (prec == -1 && PyErr_Occurred())
12833 goto onError;
12834 if (prec < 0)
12835 prec = 0;
12836 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012837 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012838 }
12839 else if (c >= '0' && c <= '9') {
12840 prec = c - '0';
12841 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012842 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012843 if (c < '0' || c > '9')
12844 break;
12845 if ((prec*10) / 10 != prec) {
12846 PyErr_SetString(PyExc_ValueError,
12847 "prec too big");
12848 goto onError;
12849 }
12850 prec = prec*10 + (c - '0');
12851 }
12852 }
12853 } /* prec */
12854 if (fmtcnt >= 0) {
12855 if (c == 'h' || c == 'l' || c == 'L') {
12856 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012857 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012858 }
12859 }
12860 if (fmtcnt < 0) {
12861 PyErr_SetString(PyExc_ValueError,
12862 "incomplete format");
12863 goto onError;
12864 }
12865 if (c != '%') {
12866 v = getnextarg(args, arglen, &argidx);
12867 if (v == NULL)
12868 goto onError;
12869 }
12870 sign = 0;
12871 fill = ' ';
12872 switch (c) {
12873
12874 case '%':
12875 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012876 kind = PyUnicode_4BYTE_KIND;
Benjamin Peterson29060642009-01-31 22:14:21 +000012877 /* presume that buffer length is at least 1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012878 PyUnicode_WRITE(kind, pbuf, 0, '%');
Benjamin Peterson29060642009-01-31 22:14:21 +000012879 len = 1;
12880 break;
12881
12882 case 's':
12883 case 'r':
12884 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000012885 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012886 temp = v;
12887 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012888 }
12889 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012890 if (c == 's')
12891 temp = PyObject_Str(v);
12892 else if (c == 'r')
12893 temp = PyObject_Repr(v);
12894 else
12895 temp = PyObject_ASCII(v);
12896 if (temp == NULL)
12897 goto onError;
12898 if (PyUnicode_Check(temp))
12899 /* nothing to do */;
12900 else {
12901 Py_DECREF(temp);
12902 PyErr_SetString(PyExc_TypeError,
12903 "%s argument has non-string str()");
12904 goto onError;
12905 }
12906 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012907 if (PyUnicode_READY(temp) == -1) {
12908 Py_CLEAR(temp);
12909 goto onError;
12910 }
12911 pbuf = PyUnicode_DATA(temp);
12912 kind = PyUnicode_KIND(temp);
12913 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012914 if (prec >= 0 && len > prec)
12915 len = prec;
12916 break;
12917
12918 case 'i':
12919 case 'd':
12920 case 'u':
12921 case 'o':
12922 case 'x':
12923 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000012924 isnumok = 0;
12925 if (PyNumber_Check(v)) {
12926 PyObject *iobj=NULL;
12927
12928 if (PyLong_Check(v)) {
12929 iobj = v;
12930 Py_INCREF(iobj);
12931 }
12932 else {
12933 iobj = PyNumber_Long(v);
12934 }
12935 if (iobj!=NULL) {
12936 if (PyLong_Check(iobj)) {
12937 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070012938 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000012939 Py_DECREF(iobj);
12940 if (!temp)
12941 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012942 if (PyUnicode_READY(temp) == -1) {
12943 Py_CLEAR(temp);
12944 goto onError;
12945 }
12946 pbuf = PyUnicode_DATA(temp);
12947 kind = PyUnicode_KIND(temp);
12948 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012949 sign = 1;
12950 }
12951 else {
12952 Py_DECREF(iobj);
12953 }
12954 }
12955 }
12956 if (!isnumok) {
12957 PyErr_Format(PyExc_TypeError,
12958 "%%%c format: a number is required, "
12959 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
12960 goto onError;
12961 }
12962 if (flags & F_ZERO)
12963 fill = '0';
12964 break;
12965
12966 case 'e':
12967 case 'E':
12968 case 'f':
12969 case 'F':
12970 case 'g':
12971 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012972 temp = formatfloat(v, flags, prec, c);
12973 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000012974 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012975 if (PyUnicode_READY(temp) == -1) {
12976 Py_CLEAR(temp);
12977 goto onError;
12978 }
12979 pbuf = PyUnicode_DATA(temp);
12980 kind = PyUnicode_KIND(temp);
12981 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012982 sign = 1;
12983 if (flags & F_ZERO)
12984 fill = '0';
12985 break;
12986
12987 case 'c':
12988 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012989 kind = PyUnicode_4BYTE_KIND;
Victor Stinnerb9dcffb2011-09-29 00:39:24 +020012990 len = formatchar(pbuf, Py_ARRAY_LENGTH(formatbuf), v);
Benjamin Peterson29060642009-01-31 22:14:21 +000012991 if (len < 0)
12992 goto onError;
12993 break;
12994
12995 default:
12996 PyErr_Format(PyExc_ValueError,
12997 "unsupported format character '%c' (0x%x) "
12998 "at index %zd",
12999 (31<=c && c<=126) ? (char)c : '?',
13000 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013001 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013002 goto onError;
13003 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013004 /* pbuf is initialized here. */
13005 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013006 if (sign) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013007 if (PyUnicode_READ(kind, pbuf, pindex) == '-' ||
13008 PyUnicode_READ(kind, pbuf, pindex) == '+') {
13009 sign = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013010 len--;
13011 }
13012 else if (flags & F_SIGN)
13013 sign = '+';
13014 else if (flags & F_BLANK)
13015 sign = ' ';
13016 else
13017 sign = 0;
13018 }
13019 if (width < len)
13020 width = len;
13021 if (rescnt - (sign != 0) < width) {
13022 reslen -= rescnt;
13023 rescnt = width + fmtcnt + 100;
13024 reslen += rescnt;
13025 if (reslen < 0) {
13026 Py_XDECREF(temp);
13027 PyErr_NoMemory();
13028 goto onError;
13029 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013030 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
13031 if (res0 == 0) {
13032 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000013033 Py_XDECREF(temp);
13034 goto onError;
13035 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013036 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000013037 }
13038 if (sign) {
13039 if (fill != ' ')
13040 *res++ = sign;
13041 rescnt--;
13042 if (width > len)
13043 width--;
13044 }
13045 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013046 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13047 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013048 if (fill != ' ') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013049 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
13050 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013051 }
13052 rescnt -= 2;
13053 width -= 2;
13054 if (width < 0)
13055 width = 0;
13056 len -= 2;
13057 }
13058 if (width > len && !(flags & F_LJUST)) {
13059 do {
13060 --rescnt;
13061 *res++ = fill;
13062 } while (--width > len);
13063 }
13064 if (fill == ' ') {
13065 if (sign)
13066 *res++ = sign;
13067 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013068 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13069 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
13070 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
13071 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013072 }
13073 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013074 /* Copy all characters, preserving len */
13075 len1 = len;
13076 while (len1--) {
13077 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
13078 rescnt--;
13079 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013080 while (--width >= len) {
13081 --rescnt;
13082 *res++ = ' ';
13083 }
13084 if (dict && (argidx < arglen) && c != '%') {
13085 PyErr_SetString(PyExc_TypeError,
13086 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +000013087 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013088 goto onError;
13089 }
13090 Py_XDECREF(temp);
13091 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013092 } /* until end */
13093 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013094 PyErr_SetString(PyExc_TypeError,
13095 "not all arguments converted during string formatting");
13096 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013097 }
13098
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013099
13100 for (max=0, res = res0; res < res0+reslen-rescnt; res++)
13101 if (*res > max)
13102 max = *res;
13103 result = PyUnicode_New(reslen - rescnt, max);
13104 if (!result)
Benjamin Peterson29060642009-01-31 22:14:21 +000013105 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013106 kind = PyUnicode_KIND(result);
13107 for (res = res0; res < res0+reslen-rescnt; res++)
13108 PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res);
13109 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013110 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013111 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013112 }
13113 Py_DECREF(uformat);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013114 assert(_PyUnicode_CheckConsistency(result, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000013115 return (PyObject *)result;
13116
Benjamin Peterson29060642009-01-31 22:14:21 +000013117 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013118 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013119 Py_DECREF(uformat);
13120 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013121 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013122 }
13123 return NULL;
13124}
13125
Jeremy Hylton938ace62002-07-17 16:30:39 +000013126static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013127unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13128
Tim Peters6d6c1a32001-08-02 04:15:00 +000013129static PyObject *
13130unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13131{
Benjamin Peterson29060642009-01-31 22:14:21 +000013132 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013133 static char *kwlist[] = {"object", "encoding", "errors", 0};
13134 char *encoding = NULL;
13135 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000013136
Benjamin Peterson14339b62009-01-31 16:36:08 +000013137 if (type != &PyUnicode_Type)
13138 return unicode_subtype_new(type, args, kwds);
13139 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000013140 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013141 return NULL;
13142 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013143 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013144 if (encoding == NULL && errors == NULL)
13145 return PyObject_Str(x);
13146 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013147 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013148}
13149
Guido van Rossume023fe02001-08-30 03:12:59 +000013150static PyObject *
13151unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13152{
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013153 PyUnicodeObject *unicode, *self;
13154 Py_ssize_t length, char_size;
13155 int share_wstr, share_utf8;
13156 unsigned int kind;
13157 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000013158
Benjamin Peterson14339b62009-01-31 16:36:08 +000013159 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013160
13161 unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
13162 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013163 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013164 assert(_PyUnicode_CHECK(unicode));
Victor Stinnere06e1452011-10-04 20:52:31 +020013165 if (PyUnicode_READY(unicode))
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013166 return NULL;
13167
13168 self = (PyUnicodeObject *) type->tp_alloc(type, 0);
13169 if (self == NULL) {
13170 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013171 return NULL;
13172 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013173 kind = PyUnicode_KIND(unicode);
13174 length = PyUnicode_GET_LENGTH(unicode);
13175
13176 _PyUnicode_LENGTH(self) = length;
13177 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13178 _PyUnicode_STATE(self).interned = 0;
13179 _PyUnicode_STATE(self).kind = kind;
13180 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020013181 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013182 _PyUnicode_STATE(self).ready = 1;
13183 _PyUnicode_WSTR(self) = NULL;
13184 _PyUnicode_UTF8_LENGTH(self) = 0;
13185 _PyUnicode_UTF8(self) = NULL;
13186 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020013187 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013188
13189 share_utf8 = 0;
13190 share_wstr = 0;
13191 if (kind == PyUnicode_1BYTE_KIND) {
13192 char_size = 1;
13193 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13194 share_utf8 = 1;
13195 }
13196 else if (kind == PyUnicode_2BYTE_KIND) {
13197 char_size = 2;
13198 if (sizeof(wchar_t) == 2)
13199 share_wstr = 1;
13200 }
13201 else {
13202 assert(kind == PyUnicode_4BYTE_KIND);
13203 char_size = 4;
13204 if (sizeof(wchar_t) == 4)
13205 share_wstr = 1;
13206 }
13207
13208 /* Ensure we won't overflow the length. */
13209 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13210 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013211 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013212 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013213 data = PyObject_MALLOC((length + 1) * char_size);
13214 if (data == NULL) {
13215 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013216 goto onError;
13217 }
13218
Victor Stinnerc3c74152011-10-02 20:39:55 +020013219 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013220 if (share_utf8) {
13221 _PyUnicode_UTF8_LENGTH(self) = length;
13222 _PyUnicode_UTF8(self) = data;
13223 }
13224 if (share_wstr) {
13225 _PyUnicode_WSTR_LENGTH(self) = length;
13226 _PyUnicode_WSTR(self) = (wchar_t *)data;
13227 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013228
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013229 Py_MEMCPY(data, PyUnicode_DATA(unicode),
13230 PyUnicode_KIND_SIZE(kind, length + 1));
13231 Py_DECREF(unicode);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013232 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013233 return (PyObject *)self;
13234
13235onError:
13236 Py_DECREF(unicode);
13237 Py_DECREF(self);
13238 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000013239}
13240
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013241PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000013242 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000013243\n\
Collin Winterd474ce82007-08-07 19:42:11 +000013244Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000013245encoding defaults to the current default string encoding.\n\
13246errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000013247
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013248static PyObject *unicode_iter(PyObject *seq);
13249
Guido van Rossumd57fd912000-03-10 22:53:23 +000013250PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000013251 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013252 "str", /* tp_name */
13253 sizeof(PyUnicodeObject), /* tp_size */
13254 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013255 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013256 (destructor)unicode_dealloc, /* tp_dealloc */
13257 0, /* tp_print */
13258 0, /* tp_getattr */
13259 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013260 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013261 unicode_repr, /* tp_repr */
13262 &unicode_as_number, /* tp_as_number */
13263 &unicode_as_sequence, /* tp_as_sequence */
13264 &unicode_as_mapping, /* tp_as_mapping */
13265 (hashfunc) unicode_hash, /* tp_hash*/
13266 0, /* tp_call*/
13267 (reprfunc) unicode_str, /* tp_str */
13268 PyObject_GenericGetAttr, /* tp_getattro */
13269 0, /* tp_setattro */
13270 0, /* tp_as_buffer */
13271 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000013272 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013273 unicode_doc, /* tp_doc */
13274 0, /* tp_traverse */
13275 0, /* tp_clear */
13276 PyUnicode_RichCompare, /* tp_richcompare */
13277 0, /* tp_weaklistoffset */
13278 unicode_iter, /* tp_iter */
13279 0, /* tp_iternext */
13280 unicode_methods, /* tp_methods */
13281 0, /* tp_members */
13282 0, /* tp_getset */
13283 &PyBaseObject_Type, /* tp_base */
13284 0, /* tp_dict */
13285 0, /* tp_descr_get */
13286 0, /* tp_descr_set */
13287 0, /* tp_dictoffset */
13288 0, /* tp_init */
13289 0, /* tp_alloc */
13290 unicode_new, /* tp_new */
13291 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013292};
13293
13294/* Initialize the Unicode implementation */
13295
Thomas Wouters78890102000-07-22 19:25:51 +000013296void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013297{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013298 int i;
13299
Thomas Wouters477c8d52006-05-27 19:21:47 +000013300 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013301 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000013302 0x000A, /* LINE FEED */
13303 0x000D, /* CARRIAGE RETURN */
13304 0x001C, /* FILE SEPARATOR */
13305 0x001D, /* GROUP SEPARATOR */
13306 0x001E, /* RECORD SEPARATOR */
13307 0x0085, /* NEXT LINE */
13308 0x2028, /* LINE SEPARATOR */
13309 0x2029, /* PARAGRAPH SEPARATOR */
13310 };
13311
Fred Drakee4315f52000-05-09 19:53:39 +000013312 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020013313 unicode_empty = PyUnicode_New(0, 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013314 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013315 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013316 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013317
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013318 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000013319 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000013320 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013321 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013322
13323 /* initialize the linebreak bloom filter */
13324 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013325 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020013326 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013327
13328 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013329}
13330
13331/* Finalize the Unicode implementation */
13332
/* Free-list clearing entry point.  This implementation releases nothing
   and always reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int released = 0;

    return released;
}
13338
Guido van Rossumd57fd912000-03-10 22:53:23 +000013339void
Thomas Wouters78890102000-07-22 19:25:51 +000013340_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013341{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013342 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013343
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000013344 Py_XDECREF(unicode_empty);
13345 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000013346
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013347 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013348 if (unicode_latin1[i]) {
13349 Py_DECREF(unicode_latin1[i]);
13350 unicode_latin1[i] = NULL;
13351 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013352 }
Christian Heimesa156e092008-02-16 07:38:31 +000013353 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000013354}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000013355
Walter Dörwald16807132007-05-25 13:52:07 +000013356void
13357PyUnicode_InternInPlace(PyObject **p)
13358{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013359 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
13360 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020013361#ifdef Py_DEBUG
13362 assert(s != NULL);
13363 assert(_PyUnicode_CHECK(s));
13364#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000013365 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020013366 return;
13367#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000013368 /* If it's a subclass, we don't really know what putting
13369 it in the interned dict might do. */
13370 if (!PyUnicode_CheckExact(s))
13371 return;
13372 if (PyUnicode_CHECK_INTERNED(s))
13373 return;
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013374 if (_PyUnicode_READY_REPLACE(p)) {
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013375 assert(0 && "_PyUnicode_READY_REPLACE fail in PyUnicode_InternInPlace");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013376 return;
13377 }
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013378 s = (PyUnicodeObject *)(*p);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013379 if (interned == NULL) {
13380 interned = PyDict_New();
13381 if (interned == NULL) {
13382 PyErr_Clear(); /* Don't leave an exception */
13383 return;
13384 }
13385 }
13386 /* It might be that the GetItem call fails even
13387 though the key is present in the dictionary,
13388 namely when this happens during a stack overflow. */
13389 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000013390 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013391 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000013392
Benjamin Peterson29060642009-01-31 22:14:21 +000013393 if (t) {
13394 Py_INCREF(t);
13395 Py_DECREF(*p);
13396 *p = t;
13397 return;
13398 }
Walter Dörwald16807132007-05-25 13:52:07 +000013399
Benjamin Peterson14339b62009-01-31 16:36:08 +000013400 PyThreadState_GET()->recursion_critical = 1;
13401 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
13402 PyErr_Clear();
13403 PyThreadState_GET()->recursion_critical = 0;
13404 return;
13405 }
13406 PyThreadState_GET()->recursion_critical = 0;
13407 /* The two references in interned are not counted by refcnt.
13408 The deallocator will take care of this */
13409 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013410 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000013411}
13412
13413void
13414PyUnicode_InternImmortal(PyObject **p)
13415{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013416 PyUnicodeObject *u = (PyUnicodeObject *)*p;
13417
Benjamin Peterson14339b62009-01-31 16:36:08 +000013418 PyUnicode_InternInPlace(p);
13419 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013420 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013421 Py_INCREF(*p);
13422 }
Walter Dörwald16807132007-05-25 13:52:07 +000013423}
13424
13425PyObject *
13426PyUnicode_InternFromString(const char *cp)
13427{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013428 PyObject *s = PyUnicode_FromString(cp);
13429 if (s == NULL)
13430 return NULL;
13431 PyUnicode_InternInPlace(&s);
13432 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000013433}
13434
Alexander Belopolsky40018472011-02-26 01:02:56 +000013435void
13436_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000013437{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013438 PyObject *keys;
13439 PyUnicodeObject *s;
13440 Py_ssize_t i, n;
13441 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000013442
Benjamin Peterson14339b62009-01-31 16:36:08 +000013443 if (interned == NULL || !PyDict_Check(interned))
13444 return;
13445 keys = PyDict_Keys(interned);
13446 if (keys == NULL || !PyList_Check(keys)) {
13447 PyErr_Clear();
13448 return;
13449 }
Walter Dörwald16807132007-05-25 13:52:07 +000013450
Benjamin Peterson14339b62009-01-31 16:36:08 +000013451 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
13452 detector, interned unicode strings are not forcibly deallocated;
13453 rather, we give them their stolen references back, and then clear
13454 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000013455
Benjamin Peterson14339b62009-01-31 16:36:08 +000013456 n = PyList_GET_SIZE(keys);
13457 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000013458 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013459 for (i = 0; i < n; i++) {
13460 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013461 if (PyUnicode_READY(s) == -1) {
13462 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013463 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013464 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013465 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013466 case SSTATE_NOT_INTERNED:
13467 /* XXX Shouldn't happen */
13468 break;
13469 case SSTATE_INTERNED_IMMORTAL:
13470 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013471 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013472 break;
13473 case SSTATE_INTERNED_MORTAL:
13474 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013475 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013476 break;
13477 default:
13478 Py_FatalError("Inconsistent interned string state.");
13479 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013480 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013481 }
13482 fprintf(stderr, "total size of all interned strings: "
13483 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
13484 "mortal/immortal\n", mortal_size, immortal_size);
13485 Py_DECREF(keys);
13486 PyDict_Clear(interned);
13487 Py_DECREF(interned);
13488 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000013489}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013490
13491
13492/********************* Unicode Iterator **************************/
13493
/* Instance layout of the str iterator (objects of PyUnicodeIter_Type). */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;     /* index of the next character to return */
    PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
13499
/* tp_dealloc of the str iterator: stop GC tracking, release the reference
   to the iterated string (it_seq is NULL once the iterator is exhausted,
   hence Py_XDECREF), then free the iterator itself. */
static void
unicodeiter_dealloc(unicodeiterobject *it)
{
    _PyObject_GC_UNTRACK(it);
    Py_XDECREF(it->it_seq);
    PyObject_GC_Del(it);
}
13507
/* tp_traverse of the str iterator: the only object reference held is
   it_seq, so that is the only one reported to the visitor.  Returns 0. */
static int
unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
{
    Py_VISIT(it->it_seq);
    return 0;
}
13514
13515static PyObject *
13516unicodeiter_next(unicodeiterobject *it)
13517{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013518 PyUnicodeObject *seq;
13519 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013520
Benjamin Peterson14339b62009-01-31 16:36:08 +000013521 assert(it != NULL);
13522 seq = it->it_seq;
13523 if (seq == NULL)
13524 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013525 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013526
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013527 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
13528 int kind = PyUnicode_KIND(seq);
13529 void *data = PyUnicode_DATA(seq);
13530 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
13531 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013532 if (item != NULL)
13533 ++it->it_index;
13534 return item;
13535 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013536
Benjamin Peterson14339b62009-01-31 16:36:08 +000013537 Py_DECREF(seq);
13538 it->it_seq = NULL;
13539 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013540}
13541
13542static PyObject *
13543unicodeiter_len(unicodeiterobject *it)
13544{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013545 Py_ssize_t len = 0;
13546 if (it->it_seq)
13547 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
13548 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013549}
13550
/* Docstring for the iterator's __length_hint__ method. */
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");

/* Method table of the str iterator; __length_hint__ is the only entry. */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {NULL,              NULL}           /* sentinel */
};
13558
/* Type object of the iterator created by unicode_iter().  It is a
   GC-tracked type whose instances are their own iterator
   (tp_iter = PyObject_SelfIter).  The initializer is positional, so the
   entries must stay in PyTypeObject slot order. */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",                     /* tp_name */
    sizeof(unicodeiterobject),          /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_reserved */
    0,                                  /* tp_repr */
    0,                                  /* tp_as_number */
    0,                                  /* tp_as_sequence */
    0,                                  /* tp_as_mapping */
    0,                                  /* tp_hash */
    0,                                  /* tp_call */
    0,                                  /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    0,                                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                                  /* tp_clear */
    0,                                  /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    PyObject_SelfIter,                  /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,                /* tp_methods */
    0,                                  /* tp_members */
};
13591
13592static PyObject *
13593unicode_iter(PyObject *seq)
13594{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013595 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013596
Benjamin Peterson14339b62009-01-31 16:36:08 +000013597 if (!PyUnicode_Check(seq)) {
13598 PyErr_BadInternalCall();
13599 return NULL;
13600 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013601 if (PyUnicode_READY(seq) == -1)
13602 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013603 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
13604 if (it == NULL)
13605 return NULL;
13606 it->it_index = 0;
13607 Py_INCREF(seq);
13608 it->it_seq = (PyUnicodeObject *)seq;
13609 _PyObject_GC_TRACK(it);
13610 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013611}
13612
/* "uniops.h" is used as a template and is included twice, with UNIOP and
   UNIOP_t defined differently each time: first for Py_UNICODE (names
   prefixed Py_UNICODE_), then for Py_UCS4 (names prefixed Py_UCS4_).  The
   macros are undefined after each inclusion so that the next set of
   definitions starts clean.
   NOTE(review): the contents of uniops.h are not visible here; presumably
   it defines helper functions named through UNIOP() -- confirm. */
#define UNIOP(x) Py_UNICODE_##x
#define UNIOP_t Py_UNICODE
#include "uniops.h"
#undef UNIOP
#undef UNIOP_t
#define UNIOP(x) Py_UCS4_##x
#define UNIOP_t Py_UCS4
#include "uniops.h"
#undef UNIOP
#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000013623
Victor Stinner71133ff2010-09-01 23:43:53 +000013624Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000013625PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000013626{
13627 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
13628 Py_UNICODE *copy;
13629 Py_ssize_t size;
13630
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013631 if (!PyUnicode_Check(unicode)) {
13632 PyErr_BadArgument();
13633 return NULL;
13634 }
Victor Stinner71133ff2010-09-01 23:43:53 +000013635 /* Ensure we won't overflow the size. */
13636 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
13637 PyErr_NoMemory();
13638 return NULL;
13639 }
13640 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
13641 size *= sizeof(Py_UNICODE);
13642 copy = PyMem_Malloc(size);
13643 if (copy == NULL) {
13644 PyErr_NoMemory();
13645 return NULL;
13646 }
13647 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
13648 return copy;
13649}
Martin v. Löwis5b222132007-06-10 09:51:05 +000013650
/* A _string module, to export formatter_parser and formatter_field_name_split
   to the string.Formatter class implemented in Python. */

/* Functions exported by the _string module; both are registered with the
   METH_O calling convention. */
static PyMethodDef _string_methods[] = {
    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
     METH_O, PyDoc_STR("split the argument as a field name")},
    {"formatter_parser", (PyCFunction) formatter_parser,
     METH_O, PyDoc_STR("parse the argument as a format string")},
    {NULL, NULL}    /* sentinel */
};
13661
/* Module definition for "_string".  The initializer is positional; the
   slot names in the comments follow the PyModuleDef layout. */
static struct PyModuleDef _string_module = {
    PyModuleDef_HEAD_INIT,
    "_string",                          /* m_name */
    PyDoc_STR("string helper module"),  /* m_doc */
    0,                                  /* m_size */
    _string_methods,                    /* m_methods */
    NULL,                               /* m_reload */
    NULL,                               /* m_traverse */
    NULL,                               /* m_clear */
    NULL                                /* m_free */
};
13673
13674PyMODINIT_FUNC
13675PyInit__string(void)
13676{
13677 return PyModule_Create(&_string_module);
13678}
13679
13680
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013681#ifdef __cplusplus
13682}
13683#endif