blob: f89a5f9e688edd8cd38f8b0e95c696d028af243f [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
Victor Stinnerce5faf62011-10-05 00:42:43 +020049#ifdef Py_DEBUG
50# define DONT_MAKE_RESULT_READY
51#endif
52
Guido van Rossumd57fd912000-03-10 22:53:23 +000053/* Limit for the Unicode object free list */
54
Christian Heimes2202f872008-02-06 14:31:34 +000055#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000056
57/* Limit for the Unicode object free list stay alive optimization.
58
59 The implementation will keep allocated Unicode memory intact for
60 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000061 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000062
Christian Heimes2202f872008-02-06 14:31:34 +000063 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000065 malloc()-overhead) bytes of unused garbage.
66
67 Setting the limit to 0 effectively turns the feature off.
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069 Note: This is an experimental feature ! If you get core dumps when
70 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000071
72*/
73
Guido van Rossumfd4b9572000-04-10 13:51:10 +000074#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000075
76/* Endianness switches; defaults to little endian */
77
78#ifdef WORDS_BIGENDIAN
79# define BYTEORDER_IS_BIG_ENDIAN
80#else
81# define BYTEORDER_IS_LITTLE_ENDIAN
82#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
/* Type check used inside the assertions of the accessor macros in this file.
   Debug builds route through _PyUnicode_CheckConsistency() with content
   checking disabled (second argument 0); other builds only call
   PyUnicode_Check(). */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200101
/* Raw field accessors for the unicode object structures.

   The macros with a leading underscore and no assert() are plain lvalues
   (usable on the left side of an assignment); they perform no checking and
   simply cast op to the structure that declares the field. */

/* Raw utf8 pointer field of a PyCompactUnicodeObject (lvalue, unchecked). */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* Checked UTF-8 pointer: for a compact ASCII string this is the memory
   immediately following the PyASCIIObject header, otherwise it is the
   utf8 field.  The string must be ready (asserted). */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* Raw utf8_length field of a PyCompactUnicodeObject (lvalue, unchecked). */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* Checked UTF-8 length: for a compact ASCII string this is the length field
   of the PyASCIIObject header, otherwise it is the utf8_length field.
   The string must be ready (asserted). */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
/* Raw wstr pointer field of the PyASCIIObject header (lvalue, unchecked). */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
/* Raw wstr_length field; note it lives in PyCompactUnicodeObject, not in
   the PyASCIIObject header (lvalue, unchecked). */
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
/* Raw length field of the PyASCIIObject header (lvalue, unchecked). */
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
/* Raw state bit-field structure of the PyASCIIObject header (lvalue). */
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
/* Raw hash field of the PyASCIIObject header (lvalue, unchecked). */
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
/* state.kind after asserting that op passes _PyUnicode_CHECK(). */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
/* length field after asserting that op passes _PyUnicode_CHECK(). */
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* Raw data.any pointer of a PyUnicodeObject (lvalue, unchecked). */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200136
/* File-local replacement for PyUnicode_READY (any earlier definition is
   discarded by the #undef): asserts _PyUnicode_CHECK(op), evaluates to 0
   when the string is already ready, and otherwise to the result of
   _PyUnicode_Ready(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready((PyObject *)(op))))
Victor Stinner910337b2011-10-03 03:20:16 +0200143
/* Like PyUnicode_READY(), but takes a pointer to the object pointer.
   Evaluates to 0 when *p_obj is already ready, otherwise to the result of
   _PyUnicode_ReadyReplace(p_obj).
   The argument is parenthesized before being dereferenced so that
   expressions such as "array + i" expand correctly. */
#define _PyUnicode_READY_REPLACE(p_obj)                 \
    (assert(_PyUnicode_CHECK(*(p_obj))),                \
     (PyUnicode_IS_READY(*(p_obj)) ?                    \
      0 : _PyUnicode_ReadyReplace((PyObject **)(p_obj))))
148
/* True if the utf8 pointer of op is the very same pointer as its character
   data, i.e. no separate UTF-8 buffer exists.  Must not be used on compact
   ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of op is the very same pointer as its character
   data.  Every use of the object goes through the macro parameter; the
   macro no longer relies on the caller having a variable named "unicode". */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
156
/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data): the string is not compact ASCII, the utf8
   pointer is non-NULL, and it differs from the character data pointer */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data): the wstr pointer is non-NULL and either
   the string is not ready yet, or wstr differs from the character data
   pointer */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
172
/* Copy a run of characters, converting each one from from_type to to_type
   with a plain cast.  from_type and to_type must be valid type names;
   begin and end delimit the source range [begin, end) and are of type
   "from_type *"; to points to the destination buffer, which is treated as
   "to_type *" and must be large enough for (end - begin) elements. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        const from_type *iter_;                         \
        to_type *to_;                                   \
        iter_ = (begin);                                \
        to_ = (to_type *)(to);                          \
        while (iter_ < (end)) {                         \
            *to_ = (to_type)*iter_;                     \
            ++iter_;                                    \
            ++to_;                                      \
        }                                               \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200187
Victor Stinnerb15d4d82011-09-28 23:59:20 +0200188/* The Unicode string has been modified: reset the hash */
189#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)
190
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
199static PyObject *interned;
200
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000201/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200202static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000203
204/* Single character Unicode strings in the Latin-1 range are being
205 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200206static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000207
/* Fast detection of the most frequent whitespace characters.
   Indexed by code point (0..127): an entry is 1 when the character is
   whitespace and 0 otherwise.  Each row below covers 8 code points. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
       0x0009 CHARACTER TABULATION
       0x000A LINE FEED
       0x000B LINE TABULATION
       0x000C FORM FEED
       0x000D CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
       0x001C FILE SEPARATOR
       0x001D GROUP SEPARATOR
       0x001E RECORD SEPARATOR
       0x001F UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27:
       0x0020 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
238
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200239/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200240static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200241static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200242
Alexander Belopolsky40018472011-02-26 01:02:56 +0000243static PyObject *
244unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000245 PyObject **errorHandler,const char *encoding, const char *reason,
246 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
247 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
248
Alexander Belopolsky40018472011-02-26 01:02:56 +0000249static void
250raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300251 const char *encoding,
252 const Py_UNICODE *unicode, Py_ssize_t size,
253 Py_ssize_t startpos, Py_ssize_t endpos,
254 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000255
/* Fast detection of line break characters in the ASCII range.
   Indexed by code point (0..127): an entry is 1 when the character is a
   line break and 0 otherwise.  Each row below covers 8 code points.
   The table is never written to, so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
       0x000A LINE FEED
       0x000B LINE TABULATION
       0x000C FORM FEED
       0x000D CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
       0x001C FILE SEPARATOR
       0x001D GROUP SEPARATOR
       0x001E RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
283
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300284/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
285 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000286Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000287PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000288{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000289#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000290 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000291#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000292 /* This is actually an illegal character, so it should
293 not be passed to unichr. */
294 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000295#endif
296}
297
#ifdef Py_DEBUG
/* Debug-only sanity check of the internal invariants of a unicode object.

   op             object to check; must satisfy PyUnicode_Check().
   check_content  when non-zero (and the string is not in the wchar_t-only
                  state) also scan every character to verify that the
                  narrowest suitable kind is in use.

   Every violated invariant trips an assert().  The function always
   returns 1, so a call can itself be wrapped in assert(). */
static int
/* FIXME: use PyObject* type for op */
_PyUnicode_CheckConsistency(void *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII: only the PyASCIIObject header exists, so none of
           the PyCompactUnicodeObject fields are inspected */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: the characters directly follow the
               PyCompactUnicodeObject header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        } else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            /* non-compact: the characters live in a separate buffer */
            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready yet: only the wstr buffer may exist */
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
            }
            else {
                /* non-compact and ready */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    /* ASCII data doubles as the UTF-8 representation */
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else
                    assert (compact->utf8 != data);
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr must alias the data exactly when the character width
               of the kind equals sizeof(wchar_t) */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            } else
                assert(ascii->wstr != data);
        }

        /* a missing buffer must come with a zero cached length */
        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data = PyUnicode_DATA(ascii);
        /* find the largest code point stored in the string */
        for (i=0; i < ascii->length; i++)
        {
            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        if (kind == PyUnicode_1BYTE_KIND) {
            /* the ascii flag must agree with the actual content */
            if (ascii->state.ascii == 0)
                assert(maxchar >= 128);
            else
                assert(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND)
            assert(maxchar >= 0x100);
        else
            assert(maxchar >= 0x10000);
    }
    return 1;
}
#endif
401
Thomas Wouters477c8d52006-05-27 19:21:47 +0000402/* --- Bloom Filters ----------------------------------------------------- */
403
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low
   log2(BLOOM_WIDTH) bits of each Unicode character (5, 6 or 7 bits,
   depending on the width of a long) as the bit index. */
407
408/* the linebreak mask is set up by Unicode_Init below */
409
/* Number of bits used by a bloom mask: the largest of 128, 64 or 32 that
   fits in LONG_BIT.  Platforms with LONG_BIT below 32 are rejected. */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

/* Type holding a bloom mask. */
#define BLOOM_MASK unsigned long

/* Bloom mask used by BLOOM_LINEBREAK() for characters >= 128. */
static BLOOM_MASK bloom_linebreak;

/* BLOOM_ADD sets, and BLOOM tests, the bit selected by the low bits of ch
   (ch modulo BLOOM_WIDTH).  BLOOM_ADD modifies mask in place.  A zero
   result from BLOOM means ch was never added; a non-zero result only means
   it may have been, since different characters can select the same bit. */
#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
#define BLOOM(mask, ch)     ((mask &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* Line break test: characters below 128 are looked up directly in
   ascii_linebreak[]; for the others the bloom mask is consulted first and
   Py_UNICODE_ISLINEBREAK() is only called when the mask bit is set. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000430
Alexander Belopolsky40018472011-02-26 01:02:56 +0000431Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200432make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000433{
434 /* calculate simple bloom-style bitmask for a given unicode string */
435
Antoine Pitrouf068f942010-01-13 14:19:12 +0000436 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000437 Py_ssize_t i;
438
439 mask = 0;
440 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200441 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000442
443 return mask;
444}
445
/* Membership test: true if chr occurs in the unicode object str.  The bloom
   mask is checked first, so PyUnicode_FindChar() is only called when the
   mask bit for chr is set.  mask is expected to have been built from the
   characters of str -- NOTE(review): confirm at the call sites. */
#define BLOOM_MEMBER(mask, chr, str) \
    (BLOOM(mask, chr) \
     && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000449
Guido van Rossumd57fd912000-03-10 22:53:23 +0000450/* --- Unicode Object ----------------------------------------------------- */
451
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200452static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200453fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200454
455Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
456 Py_ssize_t size, Py_UCS4 ch,
457 int direction)
458{
459 /* like wcschr, but doesn't stop at NULL characters */
460 Py_ssize_t i;
461 if (direction == 1) {
462 for(i = 0; i < size; i++)
463 if (PyUnicode_READ(kind, s, i) == ch)
464 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
465 }
466 else {
467 for(i = size-1; i >= 0; i--)
468 if (PyUnicode_READ(kind, s, i) == ch)
469 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
470 }
471 return NULL;
472}
473
Victor Stinnerfe226c02011-10-03 03:52:20 +0200474static PyObject*
475resize_compact(PyObject *unicode, Py_ssize_t length)
476{
477 Py_ssize_t char_size;
478 Py_ssize_t struct_size;
479 Py_ssize_t new_size;
480 int share_wstr;
481
482 assert(PyUnicode_IS_READY(unicode));
483 char_size = PyUnicode_CHARACTER_SIZE(unicode);
484 if (PyUnicode_IS_COMPACT_ASCII(unicode))
485 struct_size = sizeof(PyASCIIObject);
486 else
487 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200488 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200489
490 _Py_DEC_REFTOTAL;
491 _Py_ForgetReference(unicode);
492
493 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
494 PyErr_NoMemory();
495 return NULL;
496 }
497 new_size = (struct_size + (length + 1) * char_size);
498
499 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
500 if (unicode == NULL) {
501 PyObject_Del(unicode);
502 PyErr_NoMemory();
503 return NULL;
504 }
505 _Py_NewReference(unicode);
506 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200507 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200508 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200509 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
510 _PyUnicode_WSTR_LENGTH(unicode) = length;
511 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200512 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
513 length, 0);
514 return unicode;
515}
516
/* Resize a non-compact unicode object in place to hold length characters.

   unicode  non-compact string with a reference count of 1 (both asserted)
   length   new number of characters, not counting the trailing null

   The cached hash is reset.  For a ready string the data buffer is
   reallocated, and a wstr/utf8 pointer that aliased it is updated to
   follow it; a separately allocated UTF-8 buffer is released.  A separately
   allocated wstr buffer (always the case for a string that is not ready)
   is then reallocated as well.

   Returns 0 on success, -1 with MemoryError set on failure. */
static int
resize_inplace(PyUnicodeObject *unicode, Py_ssize_t length)
{
    wchar_t *wstr;
    assert(!PyUnicode_IS_COMPACT(unicode));
    assert(Py_REFCNT(unicode) == 1);

    /* the content is about to change: invalidate the cached hash */
    _PyUnicode_DIRTY(unicode);

    if (PyUnicode_IS_READY(unicode)) {
        Py_ssize_t char_size;
        Py_ssize_t new_size;
        int share_wstr, share_utf8;
        void *data;

        data = _PyUnicode_DATA_ANY(unicode);
        assert(data != NULL);
        char_size = PyUnicode_CHARACTER_SIZE(unicode);
        /* record the aliasing before the data pointer changes */
        share_wstr = _PyUnicode_SHARE_WSTR(unicode);
        share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
        if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
        {
            /* drop the separately allocated UTF-8 buffer */
            PyObject_DEL(_PyUnicode_UTF8(unicode));
            _PyUnicode_UTF8(unicode) = NULL;
            _PyUnicode_UTF8_LENGTH(unicode) = 0;
        }

        /* check for integer overflow of (length + 1) * char_size */
        if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
            PyErr_NoMemory();
            return -1;
        }
        new_size = (length + 1) * char_size;

        /* data is a local copy: on failure the object still holds the
           old, valid pointer */
        data = (PyObject *)PyObject_REALLOC(data, new_size);
        if (data == NULL) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_DATA_ANY(unicode) = data;
        if (share_wstr) {
            _PyUnicode_WSTR(unicode) = data;
            _PyUnicode_WSTR_LENGTH(unicode) = length;
        }
        if (share_utf8) {
            _PyUnicode_UTF8(unicode) = data;
            _PyUnicode_UTF8_LENGTH(unicode) = length;
        }
        _PyUnicode_LENGTH(unicode) = length;
        /* write the trailing null character */
        PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
        if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
            /* no separate wstr buffer to resize: done */
            assert(_PyUnicode_CheckConsistency(unicode, 0));
            return 0;
        }
    }
    /* from here on: resize the separately allocated wstr buffer */
    assert(_PyUnicode_WSTR(unicode) != NULL);

    /* check for integer overflow */
    if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
        PyErr_NoMemory();
        return -1;
    }
    wstr =  _PyUnicode_WSTR(unicode);
    wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1));
    if (!wstr) {
        PyErr_NoMemory();
        return -1;
    }
    _PyUnicode_WSTR(unicode) = wstr;
    _PyUnicode_WSTR(unicode)[length] = 0;
    _PyUnicode_WSTR_LENGTH(unicode) = length;
    assert(_PyUnicode_CheckConsistency(unicode, 0));
    return 0;
}
590
Victor Stinnerfe226c02011-10-03 03:52:20 +0200591static PyObject*
592resize_copy(PyObject *unicode, Py_ssize_t length)
593{
594 Py_ssize_t copy_length;
595 if (PyUnicode_IS_COMPACT(unicode)) {
596 PyObject *copy;
597 assert(PyUnicode_IS_READY(unicode));
598
599 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
600 if (copy == NULL)
601 return NULL;
602
603 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
604 if (PyUnicode_CopyCharacters(copy, 0,
605 unicode, 0,
606 copy_length) < 0)
607 {
608 Py_DECREF(copy);
609 return NULL;
610 }
611 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200612 }
613 else {
Victor Stinner2fd82272011-10-03 04:06:05 +0200614 PyUnicodeObject *w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200615 assert(_PyUnicode_WSTR(unicode) != NULL);
616 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinner2fd82272011-10-03 04:06:05 +0200617 w = _PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200618 if (w == NULL)
619 return NULL;
620 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
621 copy_length = Py_MIN(copy_length, length);
622 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
623 copy_length);
624 return (PyObject*)w;
625 }
626}
627
/* We allocate one more character (not just one byte) to make sure the
   string is U+0000 terminated; some code (e.g. new_identifier)
   relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000631
632 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000633 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000634
635*/
636
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200637#ifdef Py_DEBUG
638int unicode_old_new_calls = 0;
639#endif
640
Alexander Belopolsky40018472011-02-26 01:02:56 +0000641static PyUnicodeObject *
642_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000643{
644 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200645 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000646
Thomas Wouters477c8d52006-05-27 19:21:47 +0000647 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000648 if (length == 0 && unicode_empty != NULL) {
649 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200650 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000651 }
652
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000653 /* Ensure we won't overflow the size. */
654 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
655 return (PyUnicodeObject *)PyErr_NoMemory();
656 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200657 if (length < 0) {
658 PyErr_SetString(PyExc_SystemError,
659 "Negative size passed to _PyUnicode_New");
660 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000661 }
662
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200663#ifdef Py_DEBUG
664 ++unicode_old_new_calls;
665#endif
666
667 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
668 if (unicode == NULL)
669 return NULL;
670 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
671 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
672 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000673 PyErr_NoMemory();
674 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000675 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200676
Jeremy Hyltond8082792003-09-16 19:41:39 +0000677 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000678 * the caller fails before initializing str -- unicode_resize()
679 * reads str[0], and the Keep-Alive optimization can keep memory
680 * allocated for str alive across a call to unicode_dealloc(unicode).
681 * We don't want unicode_resize to read uninitialized memory in
682 * that case.
683 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200684 _PyUnicode_WSTR(unicode)[0] = 0;
685 _PyUnicode_WSTR(unicode)[length] = 0;
686 _PyUnicode_WSTR_LENGTH(unicode) = length;
687 _PyUnicode_HASH(unicode) = -1;
688 _PyUnicode_STATE(unicode).interned = 0;
689 _PyUnicode_STATE(unicode).kind = 0;
690 _PyUnicode_STATE(unicode).compact = 0;
691 _PyUnicode_STATE(unicode).ready = 0;
692 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200693 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200694 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200695 _PyUnicode_UTF8(unicode) = NULL;
696 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000697 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000698
Benjamin Peterson29060642009-01-31 22:14:21 +0000699 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000700 /* XXX UNREF/NEWREF interface should be more symmetrical */
701 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000702 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000703 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000704 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000705}
706
Victor Stinnerf42dc442011-10-02 23:33:16 +0200707static const char*
708unicode_kind_name(PyObject *unicode)
709{
Victor Stinner42dfd712011-10-03 14:41:45 +0200710 /* don't check consistency: unicode_kind_name() is called from
711 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200712 if (!PyUnicode_IS_COMPACT(unicode))
713 {
714 if (!PyUnicode_IS_READY(unicode))
715 return "wstr";
716 switch(PyUnicode_KIND(unicode))
717 {
718 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200719 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200720 return "legacy ascii";
721 else
722 return "legacy latin1";
723 case PyUnicode_2BYTE_KIND:
724 return "legacy UCS2";
725 case PyUnicode_4BYTE_KIND:
726 return "legacy UCS4";
727 default:
728 return "<legacy invalid kind>";
729 }
730 }
731 assert(PyUnicode_IS_READY(unicode));
732 switch(PyUnicode_KIND(unicode))
733 {
734 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200735 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200736 return "ascii";
737 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200738 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200739 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200740 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200741 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200742 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200743 default:
744 return "<invalid compact kind>";
745 }
746}
747
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200748#ifdef Py_DEBUG
749int unicode_new_new_calls = 0;
750
751/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
755
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
759void *_PyUnicode_data(void *unicode){
760 printf("obj %p\n", unicode);
761 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
762 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
763 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
764 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
765 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
766 return PyUnicode_DATA(unicode);
767}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200768
769void
770_PyUnicode_Dump(PyObject *op)
771{
772 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200773 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
774 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
775 void *data;
776 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
777 if (ascii->state.compact)
778 data = (compact + 1);
779 else
780 data = unicode->data.any;
781 if (ascii->wstr == data)
782 printf("shared ");
783 printf("wstr=%p", ascii->wstr);
Victor Stinnera3b334d2011-10-03 13:53:37 +0200784 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200785 printf(" (%zu), ", compact->wstr_length);
786 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
787 printf("shared ");
788 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200789 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200790 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200791}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200792#endif
793
794PyObject *
795PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
796{
797 PyObject *obj;
798 PyCompactUnicodeObject *unicode;
799 void *data;
800 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200801 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200802 Py_ssize_t char_size;
803 Py_ssize_t struct_size;
804
805 /* Optimization for empty strings */
806 if (size == 0 && unicode_empty != NULL) {
807 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200808 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200809 }
810
811#ifdef Py_DEBUG
812 ++unicode_new_new_calls;
813#endif
814
Victor Stinner9e9d6892011-10-04 01:02:02 +0200815 is_ascii = 0;
816 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200817 struct_size = sizeof(PyCompactUnicodeObject);
818 if (maxchar < 128) {
819 kind_state = PyUnicode_1BYTE_KIND;
820 char_size = 1;
821 is_ascii = 1;
822 struct_size = sizeof(PyASCIIObject);
823 }
824 else if (maxchar < 256) {
825 kind_state = PyUnicode_1BYTE_KIND;
826 char_size = 1;
827 }
828 else if (maxchar < 65536) {
829 kind_state = PyUnicode_2BYTE_KIND;
830 char_size = 2;
831 if (sizeof(wchar_t) == 2)
832 is_sharing = 1;
833 }
834 else {
835 kind_state = PyUnicode_4BYTE_KIND;
836 char_size = 4;
837 if (sizeof(wchar_t) == 4)
838 is_sharing = 1;
839 }
840
841 /* Ensure we won't overflow the size. */
842 if (size < 0) {
843 PyErr_SetString(PyExc_SystemError,
844 "Negative size passed to PyUnicode_New");
845 return NULL;
846 }
847 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
848 return PyErr_NoMemory();
849
850 /* Duplicated allocation code from _PyObject_New() instead of a call to
851 * PyObject_New() so we are able to allocate space for the object and
852 * it's data buffer.
853 */
854 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
855 if (obj == NULL)
856 return PyErr_NoMemory();
857 obj = PyObject_INIT(obj, &PyUnicode_Type);
858 if (obj == NULL)
859 return NULL;
860
861 unicode = (PyCompactUnicodeObject *)obj;
862 if (is_ascii)
863 data = ((PyASCIIObject*)obj) + 1;
864 else
865 data = unicode + 1;
866 _PyUnicode_LENGTH(unicode) = size;
867 _PyUnicode_HASH(unicode) = -1;
868 _PyUnicode_STATE(unicode).interned = 0;
869 _PyUnicode_STATE(unicode).kind = kind_state;
870 _PyUnicode_STATE(unicode).compact = 1;
871 _PyUnicode_STATE(unicode).ready = 1;
872 _PyUnicode_STATE(unicode).ascii = is_ascii;
873 if (is_ascii) {
874 ((char*)data)[size] = 0;
875 _PyUnicode_WSTR(unicode) = NULL;
876 }
877 else if (kind_state == PyUnicode_1BYTE_KIND) {
878 ((char*)data)[size] = 0;
879 _PyUnicode_WSTR(unicode) = NULL;
880 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200881 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200882 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200883 }
884 else {
885 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200886 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200887 if (kind_state == PyUnicode_2BYTE_KIND)
888 ((Py_UCS2*)data)[size] = 0;
889 else /* kind_state == PyUnicode_4BYTE_KIND */
890 ((Py_UCS4*)data)[size] = 0;
891 if (is_sharing) {
892 _PyUnicode_WSTR_LENGTH(unicode) = size;
893 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
894 }
895 else {
896 _PyUnicode_WSTR_LENGTH(unicode) = 0;
897 _PyUnicode_WSTR(unicode) = NULL;
898 }
899 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200900 assert(_PyUnicode_CheckConsistency(unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200901 return obj;
902}
903
904#if SIZEOF_WCHAR_T == 2
905/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
906 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +0200907 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200908
909 This function assumes that unicode can hold one more code point than wstr
910 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +0200911static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200912unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
913 PyUnicodeObject *unicode)
914{
915 const wchar_t *iter;
916 Py_UCS4 *ucs4_out;
917
Victor Stinner910337b2011-10-03 03:20:16 +0200918 assert(unicode != NULL);
919 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200920 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
921 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
922
923 for (iter = begin; iter < end; ) {
924 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
925 _PyUnicode_GET_LENGTH(unicode)));
926 if (*iter >= 0xD800 && *iter <= 0xDBFF
927 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
928 {
929 *ucs4_out++ = (((iter[0] & 0x3FF)<<10) | (iter[1] & 0x3FF)) + 0x10000;
930 iter += 2;
931 }
932 else {
933 *ucs4_out++ = *iter;
934 iter++;
935 }
936 }
937 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
938 _PyUnicode_GET_LENGTH(unicode)));
939
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200940}
941#endif
942
Victor Stinnercd9950f2011-10-02 00:34:53 +0200943static int
944_PyUnicode_Dirty(PyObject *unicode)
945{
Victor Stinner910337b2011-10-03 03:20:16 +0200946 assert(_PyUnicode_CHECK(unicode));
Victor Stinnercd9950f2011-10-02 00:34:53 +0200947 if (Py_REFCNT(unicode) != 1) {
Victor Stinner01698042011-10-04 00:04:26 +0200948 PyErr_SetString(PyExc_SystemError,
Victor Stinnercd9950f2011-10-02 00:34:53 +0200949 "Cannot modify a string having more than 1 reference");
950 return -1;
951 }
952 _PyUnicode_DIRTY(unicode);
953 return 0;
954}
955
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200956Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200957PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
958 PyObject *from, Py_ssize_t from_start,
959 Py_ssize_t how_many)
960{
Victor Stinnera0702ab2011-09-29 14:14:38 +0200961 unsigned int from_kind, to_kind;
962 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200963
Victor Stinnerb1536152011-09-30 02:26:10 +0200964 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
965 PyErr_BadInternalCall();
966 return -1;
967 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200968
969 if (PyUnicode_READY(from))
970 return -1;
971 if (PyUnicode_READY(to))
972 return -1;
973
Victor Stinnerff9e50f2011-09-28 22:17:19 +0200974 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200975 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
Victor Stinner01698042011-10-04 00:04:26 +0200976 PyErr_Format(PyExc_SystemError,
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200977 "Cannot write %zi characters at %zi "
978 "in a string of %zi characters",
979 how_many, to_start, PyUnicode_GET_LENGTH(to));
980 return -1;
981 }
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200982 if (how_many == 0)
983 return 0;
984
Victor Stinnercd9950f2011-10-02 00:34:53 +0200985 if (_PyUnicode_Dirty(to))
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200986 return -1;
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200987
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200988 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200989 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200990 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200991 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200992
Victor Stinnerf42dc442011-10-02 23:33:16 +0200993 if (from_kind == to_kind
994 /* deny latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +0200995 && !(!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200996 {
Victor Stinnera0702ab2011-09-29 14:14:38 +0200997 Py_MEMCPY((char*)to_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200998 + PyUnicode_KIND_SIZE(to_kind, to_start),
Victor Stinnera0702ab2011-09-29 14:14:38 +0200999 (char*)from_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001000 + PyUnicode_KIND_SIZE(from_kind, from_start),
1001 PyUnicode_KIND_SIZE(to_kind, how_many));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001002 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001003 else if (from_kind == PyUnicode_1BYTE_KIND
1004 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001005 {
1006 _PyUnicode_CONVERT_BYTES(
1007 Py_UCS1, Py_UCS2,
1008 PyUnicode_1BYTE_DATA(from) + from_start,
1009 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1010 PyUnicode_2BYTE_DATA(to) + to_start
1011 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001012 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001013 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001014 && to_kind == PyUnicode_4BYTE_KIND)
1015 {
1016 _PyUnicode_CONVERT_BYTES(
1017 Py_UCS1, Py_UCS4,
1018 PyUnicode_1BYTE_DATA(from) + from_start,
1019 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1020 PyUnicode_4BYTE_DATA(to) + to_start
1021 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001022 }
1023 else if (from_kind == PyUnicode_2BYTE_KIND
1024 && to_kind == PyUnicode_4BYTE_KIND)
1025 {
1026 _PyUnicode_CONVERT_BYTES(
1027 Py_UCS2, Py_UCS4,
1028 PyUnicode_2BYTE_DATA(from) + from_start,
1029 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1030 PyUnicode_4BYTE_DATA(to) + to_start
1031 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001032 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001033 else {
1034 int invalid_kinds;
Victor Stinnerf42dc442011-10-02 23:33:16 +02001035
1036 /* check if max_char(from substring) <= max_char(to) */
1037 if (from_kind > to_kind
1038 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001039 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001040 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001041 /* slow path to check for character overflow */
1042 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1043 Py_UCS4 ch, maxchar;
1044 Py_ssize_t i;
1045
1046 maxchar = 0;
1047 invalid_kinds = 0;
1048 for (i=0; i < how_many; i++) {
1049 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1050 if (ch > maxchar) {
1051 maxchar = ch;
1052 if (maxchar > to_maxchar) {
1053 invalid_kinds = 1;
1054 break;
1055 }
1056 }
1057 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1058 }
1059 }
1060 else
1061 invalid_kinds = 1;
1062 if (invalid_kinds) {
Victor Stinner01698042011-10-04 00:04:26 +02001063 PyErr_Format(PyExc_SystemError,
Victor Stinnerf42dc442011-10-02 23:33:16 +02001064 "Cannot copy %s characters "
1065 "into a string of %s characters",
1066 unicode_kind_name(from),
1067 unicode_kind_name(to));
Victor Stinnera0702ab2011-09-29 14:14:38 +02001068 return -1;
1069 }
1070 }
1071 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001072}
1073
Victor Stinner17222162011-09-28 22:15:37 +02001074/* Find the maximum code point and count the number of surrogate pairs so a
1075 correct string length can be computed before converting a string to UCS4.
1076 This function counts single surrogates as a character and not as a pair.
1077
1078 Return 0 on success, or -1 on error. */
1079static int
1080find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1081 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001082{
1083 const wchar_t *iter;
1084
Victor Stinnerc53be962011-10-02 21:33:54 +02001085 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001086 *num_surrogates = 0;
1087 *maxchar = 0;
1088
1089 for (iter = begin; iter < end; ) {
Victor Stinnerae864852011-10-05 14:02:44 +02001090 if (*iter > *maxchar) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001091 *maxchar = *iter;
Victor Stinnerae864852011-10-05 14:02:44 +02001092#if SIZEOF_WCHAR_T != 2
1093 if (*maxchar >= 0x10000)
1094 return 0;
1095#endif
1096 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001097#if SIZEOF_WCHAR_T == 2
1098 if (*iter >= 0xD800 && *iter <= 0xDBFF
1099 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
1100 {
1101 Py_UCS4 surrogate_val;
1102 surrogate_val = (((iter[0] & 0x3FF)<<10)
1103 | (iter[1] & 0x3FF)) + 0x10000;
1104 ++(*num_surrogates);
1105 if (surrogate_val > *maxchar)
1106 *maxchar = surrogate_val;
1107 iter += 2;
1108 }
1109 else
1110 iter++;
1111#else
1112 iter++;
1113#endif
1114 }
1115 return 0;
1116}
1117
1118#ifdef Py_DEBUG
1119int unicode_ready_calls = 0;
1120#endif
1121
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001122static int
1123unicode_ready(PyObject **p_obj, int replace)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001124{
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001125 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001126 wchar_t *end;
1127 Py_UCS4 maxchar = 0;
1128 Py_ssize_t num_surrogates;
1129#if SIZEOF_WCHAR_T == 2
1130 Py_ssize_t length_wo_surrogates;
1131#endif
1132
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001133 assert(p_obj != NULL);
1134 unicode = (PyUnicodeObject *)*p_obj;
1135
Georg Brandl7597add2011-10-05 16:36:47 +02001136 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001137 strings were created using _PyObject_New() and where no canonical
1138 representation (the str field) has been set yet aka strings
1139 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001140 assert(_PyUnicode_CHECK(unicode));
1141 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001142 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001143 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001144 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001145 /* Actually, it should neither be interned nor be anything else: */
1146 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001147
1148#ifdef Py_DEBUG
1149 ++unicode_ready_calls;
1150#endif
1151
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001152#ifdef Py_DEBUG
1153 assert(!replace || Py_REFCNT(unicode) == 1);
1154#else
1155 if (replace && Py_REFCNT(unicode) != 1)
1156 replace = 0;
1157#endif
1158 if (replace) {
1159 Py_ssize_t len = _PyUnicode_WSTR_LENGTH(unicode);
1160 wchar_t *wstr = _PyUnicode_WSTR(unicode);
1161 /* Optimization for empty strings */
1162 if (len == 0) {
1163 Py_INCREF(unicode_empty);
1164 Py_DECREF(*p_obj);
1165 *p_obj = unicode_empty;
1166 return 0;
1167 }
1168 if (len == 1 && wstr[0] < 256) {
1169 PyObject *latin1_char = get_latin1_char((unsigned char)wstr[0]);
1170 if (latin1_char == NULL)
1171 return -1;
1172 Py_DECREF(*p_obj);
1173 *p_obj = latin1_char;
1174 return 0;
1175 }
1176 }
1177
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001178 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001179 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001180 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001181 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001182
1183 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001184 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1185 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001186 PyErr_NoMemory();
1187 return -1;
1188 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001189 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001190 _PyUnicode_WSTR(unicode), end,
1191 PyUnicode_1BYTE_DATA(unicode));
1192 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1193 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1194 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1195 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001196 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001197 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001198 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001199 }
1200 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001201 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001202 _PyUnicode_UTF8(unicode) = NULL;
1203 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001204 }
1205 PyObject_FREE(_PyUnicode_WSTR(unicode));
1206 _PyUnicode_WSTR(unicode) = NULL;
1207 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1208 }
1209 /* In this case we might have to convert down from 4-byte native
1210 wchar_t to 2-byte unicode. */
1211 else if (maxchar < 65536) {
1212 assert(num_surrogates == 0 &&
1213 "FindMaxCharAndNumSurrogatePairs() messed up");
1214
Victor Stinner506f5922011-09-28 22:34:18 +02001215#if SIZEOF_WCHAR_T == 2
1216 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001217 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001218 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1219 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1220 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001221 _PyUnicode_UTF8(unicode) = NULL;
1222 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001223#else
1224 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001225 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001226 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001227 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001228 PyErr_NoMemory();
1229 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001230 }
Victor Stinner506f5922011-09-28 22:34:18 +02001231 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1232 _PyUnicode_WSTR(unicode), end,
1233 PyUnicode_2BYTE_DATA(unicode));
1234 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1235 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1236 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001237 _PyUnicode_UTF8(unicode) = NULL;
1238 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001239 PyObject_FREE(_PyUnicode_WSTR(unicode));
1240 _PyUnicode_WSTR(unicode) = NULL;
1241 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1242#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001243 }
1244 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1245 else {
1246#if SIZEOF_WCHAR_T == 2
1247 /* in case the native representation is 2-bytes, we need to allocate a
1248 new normalized 4-byte version. */
1249 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001250 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1251 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001252 PyErr_NoMemory();
1253 return -1;
1254 }
1255 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1256 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001257 _PyUnicode_UTF8(unicode) = NULL;
1258 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001259 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1260 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001261 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001262 PyObject_FREE(_PyUnicode_WSTR(unicode));
1263 _PyUnicode_WSTR(unicode) = NULL;
1264 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1265#else
1266 assert(num_surrogates == 0);
1267
Victor Stinnerc3c74152011-10-02 20:39:55 +02001268 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001269 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001270 _PyUnicode_UTF8(unicode) = NULL;
1271 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001272 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1273#endif
1274 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1275 }
1276 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001277 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001278 return 0;
1279}
1280
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001281int
1282_PyUnicode_ReadyReplace(PyObject **op)
1283{
1284 return unicode_ready(op, 1);
1285}
1286
1287int
1288_PyUnicode_Ready(PyObject *op)
1289{
1290 return unicode_ready(&op, 0);
1291}
1292
Alexander Belopolsky40018472011-02-26 01:02:56 +00001293static void
1294unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001295{
Walter Dörwald16807132007-05-25 13:52:07 +00001296 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001297 case SSTATE_NOT_INTERNED:
1298 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001299
Benjamin Peterson29060642009-01-31 22:14:21 +00001300 case SSTATE_INTERNED_MORTAL:
1301 /* revive dead object temporarily for DelItem */
1302 Py_REFCNT(unicode) = 3;
1303 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
1304 Py_FatalError(
1305 "deletion of interned string failed");
1306 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001307
Benjamin Peterson29060642009-01-31 22:14:21 +00001308 case SSTATE_INTERNED_IMMORTAL:
1309 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001310
Benjamin Peterson29060642009-01-31 22:14:21 +00001311 default:
1312 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001313 }
1314
Victor Stinner03490912011-10-03 23:45:12 +02001315 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001316 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001317 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001318 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001319
1320 if (PyUnicode_IS_COMPACT(unicode)) {
1321 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001322 }
1323 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001324 if (_PyUnicode_DATA_ANY(unicode))
1325 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Benjamin Peterson29060642009-01-31 22:14:21 +00001326 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001327 }
1328}
1329
Alexander Belopolsky40018472011-02-26 01:02:56 +00001330static int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001331unicode_resizable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001332{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001333 if (Py_REFCNT(unicode) != 1)
1334 return 0;
1335 if (PyUnicode_CHECK_INTERNED(unicode))
1336 return 0;
Benjamin Peterson7f3140e2011-10-03 19:37:29 -04001337 assert(unicode != unicode_empty);
Victor Stinner77bb47b2011-10-03 20:06:05 +02001338#ifdef Py_DEBUG
1339 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND
1340 && PyUnicode_GET_LENGTH(unicode) == 1)
1341 {
1342 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001343 if (ch < 256 && unicode_latin1[ch] == unicode)
1344 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001345 }
Victor Stinner77bb47b2011-10-03 20:06:05 +02001346#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001347 return 1;
1348}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001349
Victor Stinnerfe226c02011-10-03 03:52:20 +02001350static int
1351unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1352{
1353 PyObject *unicode;
1354 Py_ssize_t old_length;
1355
1356 assert(p_unicode != NULL);
1357 unicode = *p_unicode;
1358
1359 assert(unicode != NULL);
1360 assert(PyUnicode_Check(unicode));
1361 assert(0 <= length);
1362
Victor Stinner910337b2011-10-03 03:20:16 +02001363 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001364 old_length = PyUnicode_WSTR_LENGTH(unicode);
1365 else
1366 old_length = PyUnicode_GET_LENGTH(unicode);
1367 if (old_length == length)
1368 return 0;
1369
Victor Stinnerfe226c02011-10-03 03:52:20 +02001370 if (!unicode_resizable(unicode)) {
1371 PyObject *copy = resize_copy(unicode, length);
1372 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001373 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001374 Py_DECREF(*p_unicode);
1375 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001376 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001377 }
1378
Victor Stinnerfe226c02011-10-03 03:52:20 +02001379 if (PyUnicode_IS_COMPACT(unicode)) {
1380 *p_unicode = resize_compact(unicode, length);
1381 if (*p_unicode == NULL)
1382 return -1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001383 assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001384 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001385 }
1386 return resize_inplace((PyUnicodeObject*)unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001387}
1388
Alexander Belopolsky40018472011-02-26 01:02:56 +00001389int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001390PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001391{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001392 PyObject *unicode;
1393 if (p_unicode == NULL) {
1394 PyErr_BadInternalCall();
1395 return -1;
1396 }
1397 unicode = *p_unicode;
1398 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0
1399 || _PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND)
1400 {
1401 PyErr_BadInternalCall();
1402 return -1;
1403 }
1404 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001405}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001406
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001407static PyObject*
1408get_latin1_char(unsigned char ch)
1409{
Victor Stinnera464fc12011-10-02 20:39:30 +02001410 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001411 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001412 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001413 if (!unicode)
1414 return NULL;
1415 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001416 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001417 unicode_latin1[ch] = unicode;
1418 }
1419 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001420 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001421}
1422
Alexander Belopolsky40018472011-02-26 01:02:56 +00001423PyObject *
1424PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001425{
1426 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001427 Py_UCS4 maxchar = 0;
1428 Py_ssize_t num_surrogates;
1429
1430 if (u == NULL)
1431 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001432
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001433 /* If the Unicode data is known at construction time, we can apply
1434 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001435
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001436 /* Optimization for empty strings */
1437 if (size == 0 && unicode_empty != NULL) {
1438 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001439 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001440 }
Tim Petersced69f82003-09-16 20:30:58 +00001441
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001442 /* Single character Unicode objects in the Latin-1 range are
1443 shared when using this constructor */
1444 if (size == 1 && *u < 256)
1445 return get_latin1_char((unsigned char)*u);
1446
1447 /* If not empty and not single character, copy the Unicode data
1448 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001449 if (find_maxchar_surrogates(u, u + size,
1450 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001451 return NULL;
1452
1453 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1454 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001455 if (!unicode)
1456 return NULL;
1457
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001458 switch (PyUnicode_KIND(unicode)) {
1459 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001460 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001461 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1462 break;
1463 case PyUnicode_2BYTE_KIND:
1464#if Py_UNICODE_SIZE == 2
1465 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1466#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001467 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001468 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1469#endif
1470 break;
1471 case PyUnicode_4BYTE_KIND:
1472#if SIZEOF_WCHAR_T == 2
1473 /* This is the only case which has to process surrogates, thus
1474 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001475 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001476#else
1477 assert(num_surrogates == 0);
1478 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1479#endif
1480 break;
1481 default:
1482 assert(0 && "Impossible state");
1483 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001484
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001485 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001486 return (PyObject *)unicode;
1487}
1488
Alexander Belopolsky40018472011-02-26 01:02:56 +00001489PyObject *
1490PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001491{
1492 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001493
Benjamin Peterson14339b62009-01-31 16:36:08 +00001494 if (size < 0) {
1495 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001496 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001497 return NULL;
1498 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001499
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001500 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001501 some optimizations which share commonly used objects.
1502 Also, this means the input must be UTF-8, so fall back to the
1503 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001504 if (u != NULL) {
1505
Benjamin Peterson29060642009-01-31 22:14:21 +00001506 /* Optimization for empty strings */
1507 if (size == 0 && unicode_empty != NULL) {
1508 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001509 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001510 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001511
1512 /* Single characters are shared when using this constructor.
1513 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001514 if (size == 1 && Py_CHARMASK(*u) < 128)
1515 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001516
1517 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001518 }
1519
Walter Dörwald55507312007-05-18 13:12:10 +00001520 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001521 if (!unicode)
1522 return NULL;
1523
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001524 return (PyObject *)unicode;
1525}
1526
Alexander Belopolsky40018472011-02-26 01:02:56 +00001527PyObject *
1528PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001529{
1530 size_t size = strlen(u);
1531 if (size > PY_SSIZE_T_MAX) {
1532 PyErr_SetString(PyExc_OverflowError, "input too long");
1533 return NULL;
1534 }
1535
1536 return PyUnicode_FromStringAndSize(u, size);
1537}
1538
Victor Stinnere57b1c02011-09-28 22:20:48 +02001539static PyObject*
Victor Stinner702c7342011-10-05 13:50:52 +02001540unicode_fromascii(const unsigned char* u, Py_ssize_t size)
1541{
1542 PyObject *res = PyUnicode_New(size, 127);
1543 if (!res)
1544 return NULL;
1545 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1546 return res;
1547}
1548
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001549static Py_UCS4
1550kind_maxchar_limit(unsigned int kind)
1551{
1552 switch(kind) {
1553 case PyUnicode_1BYTE_KIND:
1554 return 0x80;
1555 case PyUnicode_2BYTE_KIND:
1556 return 0x100;
1557 case PyUnicode_4BYTE_KIND:
1558 return 0x10000;
1559 default:
1560 assert(0 && "invalid kind");
1561 return 0x10ffff;
1562 }
1563}
1564
Victor Stinner702c7342011-10-05 13:50:52 +02001565static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001566_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001567{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001568 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001569 unsigned char max_char = 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001570 Py_ssize_t i;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001571
1572 assert(size >= 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001573 for (i = 0; i < size; i++) {
1574 if (u[i] & 0x80) {
Victor Stinnerb9275c12011-10-05 14:01:42 +02001575 max_char = 255;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001576 break;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001577 }
1578 }
Victor Stinnerb9275c12011-10-05 14:01:42 +02001579 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001580 if (!res)
1581 return NULL;
1582 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001583 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001584 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001585}
1586
Victor Stinnere57b1c02011-09-28 22:20:48 +02001587static PyObject*
1588_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001589{
1590 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001591 Py_UCS2 max_char = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001592 Py_ssize_t i;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001593
1594 assert(size >= 0);
1595 for (i = 0; i < size; i++) {
1596 if (u[i] > max_char) {
1597 max_char = u[i];
1598 if (max_char >= 256)
1599 break;
1600 }
1601 }
1602 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001603 if (!res)
1604 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001605 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001606 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1607 else
1608 for (i = 0; i < size; i++)
1609 PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i];
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001610 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001611 return res;
1612}
1613
Victor Stinnere57b1c02011-09-28 22:20:48 +02001614static PyObject*
1615_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001616{
1617 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001618 Py_UCS4 max_char = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001619 Py_ssize_t i;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001620
1621 assert(size >= 0);
1622 for (i = 0; i < size; i++) {
1623 if (u[i] > max_char) {
1624 max_char = u[i];
1625 if (max_char >= 0x10000)
1626 break;
1627 }
1628 }
1629 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001630 if (!res)
1631 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001632 if (max_char >= 0x10000)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001633 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1634 else {
1635 int kind = PyUnicode_KIND(res);
1636 void *data = PyUnicode_DATA(res);
1637 for (i = 0; i < size; i++)
1638 PyUnicode_WRITE(kind, data, i, u[i]);
1639 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001640 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001641 return res;
1642}
1643
1644PyObject*
1645PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1646{
1647 switch(kind) {
1648 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001649 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001650 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001651 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001652 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001653 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001654 default:
1655 assert(0 && "invalid kind");
1656 PyErr_SetString(PyExc_SystemError, "invalid kind");
1657 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001658 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001659}
1660
Victor Stinner034f6cf2011-09-30 02:26:44 +02001661PyObject*
1662PyUnicode_Copy(PyObject *unicode)
1663{
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001664 Py_ssize_t size;
1665 PyObject *copy;
1666 void *data;
1667
Victor Stinner034f6cf2011-09-30 02:26:44 +02001668 if (!PyUnicode_Check(unicode)) {
1669 PyErr_BadInternalCall();
1670 return NULL;
1671 }
1672 if (PyUnicode_READY(unicode))
1673 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001674
1675 size = PyUnicode_GET_LENGTH(unicode);
1676 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1677 if (!copy)
1678 return NULL;
1679 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1680
1681 data = PyUnicode_DATA(unicode);
1682 switch (PyUnicode_KIND(unicode))
1683 {
1684 case PyUnicode_1BYTE_KIND:
1685 memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1686 break;
1687 case PyUnicode_2BYTE_KIND:
1688 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1689 break;
1690 case PyUnicode_4BYTE_KIND:
1691 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1692 break;
1693 default:
1694 assert(0);
1695 break;
1696 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001697 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001698 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001699}
1700
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001701
Victor Stinnerbc603d12011-10-02 01:00:40 +02001702/* Widen Unicode objects to larger buffers. Don't write terminating null
1703 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001704
1705void*
1706_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1707{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001708 Py_ssize_t len;
1709 void *result;
1710 unsigned int skind;
1711
1712 if (PyUnicode_READY(s))
1713 return NULL;
1714
1715 len = PyUnicode_GET_LENGTH(s);
1716 skind = PyUnicode_KIND(s);
1717 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02001718 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001719 return NULL;
1720 }
1721 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001722 case PyUnicode_2BYTE_KIND:
1723 result = PyMem_Malloc(len * sizeof(Py_UCS2));
1724 if (!result)
1725 return PyErr_NoMemory();
1726 assert(skind == PyUnicode_1BYTE_KIND);
1727 _PyUnicode_CONVERT_BYTES(
1728 Py_UCS1, Py_UCS2,
1729 PyUnicode_1BYTE_DATA(s),
1730 PyUnicode_1BYTE_DATA(s) + len,
1731 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001732 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001733 case PyUnicode_4BYTE_KIND:
1734 result = PyMem_Malloc(len * sizeof(Py_UCS4));
1735 if (!result)
1736 return PyErr_NoMemory();
1737 if (skind == PyUnicode_2BYTE_KIND) {
1738 _PyUnicode_CONVERT_BYTES(
1739 Py_UCS2, Py_UCS4,
1740 PyUnicode_2BYTE_DATA(s),
1741 PyUnicode_2BYTE_DATA(s) + len,
1742 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001743 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001744 else {
1745 assert(skind == PyUnicode_1BYTE_KIND);
1746 _PyUnicode_CONVERT_BYTES(
1747 Py_UCS1, Py_UCS4,
1748 PyUnicode_1BYTE_DATA(s),
1749 PyUnicode_1BYTE_DATA(s) + len,
1750 result);
1751 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001752 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001753 default:
1754 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001755 }
Victor Stinner01698042011-10-04 00:04:26 +02001756 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001757 return NULL;
1758}
1759
1760static Py_UCS4*
1761as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1762 int copy_null)
1763{
1764 int kind;
1765 void *data;
1766 Py_ssize_t len, targetlen;
1767 if (PyUnicode_READY(string) == -1)
1768 return NULL;
1769 kind = PyUnicode_KIND(string);
1770 data = PyUnicode_DATA(string);
1771 len = PyUnicode_GET_LENGTH(string);
1772 targetlen = len;
1773 if (copy_null)
1774 targetlen++;
1775 if (!target) {
1776 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1777 PyErr_NoMemory();
1778 return NULL;
1779 }
1780 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1781 if (!target) {
1782 PyErr_NoMemory();
1783 return NULL;
1784 }
1785 }
1786 else {
1787 if (targetsize < targetlen) {
1788 PyErr_Format(PyExc_SystemError,
1789 "string is longer than the buffer");
1790 if (copy_null && 0 < targetsize)
1791 target[0] = 0;
1792 return NULL;
1793 }
1794 }
1795 if (kind != PyUnicode_4BYTE_KIND) {
1796 Py_ssize_t i;
1797 for (i = 0; i < len; i++)
1798 target[i] = PyUnicode_READ(kind, data, i);
1799 }
1800 else
1801 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1802 if (copy_null)
1803 target[len] = 0;
1804 return target;
1805}
1806
1807Py_UCS4*
1808PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1809 int copy_null)
1810{
1811 if (target == NULL || targetsize < 1) {
1812 PyErr_BadInternalCall();
1813 return NULL;
1814 }
1815 return as_ucs4(string, target, targetsize, copy_null);
1816}
1817
1818Py_UCS4*
1819PyUnicode_AsUCS4Copy(PyObject *string)
1820{
1821 return as_ucs4(string, NULL, 0, 1);
1822}
1823
1824#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00001825
Alexander Belopolsky40018472011-02-26 01:02:56 +00001826PyObject *
1827PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001828{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001829 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +00001830 if (size == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001831 return PyUnicode_New(0, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00001832 PyErr_BadInternalCall();
1833 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001834 }
1835
Martin v. Löwis790465f2008-04-05 20:41:37 +00001836 if (size == -1) {
1837 size = wcslen(w);
1838 }
1839
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001840 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001841}
1842
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001843#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00001844
Walter Dörwald346737f2007-05-31 10:44:43 +00001845static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001846makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
1847 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00001848{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001849 *fmt++ = '%';
1850 if (width) {
1851 if (zeropad)
1852 *fmt++ = '0';
1853 fmt += sprintf(fmt, "%d", width);
1854 }
1855 if (precision)
1856 fmt += sprintf(fmt, ".%d", precision);
1857 if (longflag)
1858 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001859 else if (longlongflag) {
1860 /* longlongflag should only ever be nonzero on machines with
1861 HAVE_LONG_LONG defined */
1862#ifdef HAVE_LONG_LONG
1863 char *f = PY_FORMAT_LONG_LONG;
1864 while (*f)
1865 *fmt++ = *f++;
1866#else
1867 /* we shouldn't ever get here */
1868 assert(0);
1869 *fmt++ = 'l';
1870#endif
1871 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001872 else if (size_tflag) {
1873 char *f = PY_FORMAT_SIZE_T;
1874 while (*f)
1875 *fmt++ = *f++;
1876 }
1877 *fmt++ = c;
1878 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00001879}
1880
/* Helper for PyUnicode_FromFormatV(): parse the part of a conversion
   specification that follows the '%'.

   f must point at the '%'.  Width, precision and the long / long long /
   size_t flags are stored through the corresponding pointers; each of
   them may be NULL if the caller is not interested.  Returns a pointer to
   the conversion character, or, for the malformed inputs handled below, to
   the character just before the point where parsing stopped. */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* step over the '%', then read the width.precision part,
       e.g. "%2.5s" => width=2, precision=5 */
    f++;
    for (; Py_ISDIGIT((unsigned)*f); f++)
        width = (width*10) + *f - '0';
    if (*f == '.') {
        f++;
        for (; Py_ISDIGIT((unsigned)*f); f++)
            precision = (precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }
    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;

    /* Length modifiers: %ld, %lu, %li, their "ll" variants and the "z"
       variants.  A modifier is only consumed when an integer conversion
       character follows it. */
    if (*f == 'l') {
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            f += 1;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
    }
    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
        size_tflag = 1;
        f += 1;
    }

    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
1945
/* Upper bound on the number of characters produced by %ld: 21 characters
   cover a 64-bit integer written in decimal plus an optional sign. */
#define MAX_LONG_CHARS 21
/* Upper bound on the number of characters produced by %lld.
   At most ceil(log10(256)*SIZEOF_LONG_LONG) digits are needed, plus 1 for
   the sign; 53/22 serves as an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
1953
Walter Dörwaldd2034312007-05-18 16:29:38 +00001954PyObject *
1955PyUnicode_FromFormatV(const char *format, va_list vargs)
1956{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001957 va_list count;
1958 Py_ssize_t callcount = 0;
1959 PyObject **callresults = NULL;
1960 PyObject **callresult = NULL;
1961 Py_ssize_t n = 0;
1962 int width = 0;
1963 int precision = 0;
1964 int zeropad;
1965 const char* f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001966 PyUnicodeObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001967 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001968 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001969 Py_UCS4 maxchar = 127; /* result is ASCII by default */
1970 Py_UCS4 argmaxchar;
1971 Py_ssize_t numbersize = 0;
1972 char *numberresults = NULL;
1973 char *numberresult = NULL;
1974 Py_ssize_t i;
1975 int kind;
1976 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001977
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001978 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001979 /* step 1: count the number of %S/%R/%A/%s format specifications
1980 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
1981 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001982 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02001983 * also estimate a upper bound for all the number formats in the string,
1984 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001985 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00001986 for (f = format; *f; f++) {
1987 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001988 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001989 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
1990 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
1991 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
1992 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001993
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001994 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001995#ifdef HAVE_LONG_LONG
1996 if (longlongflag) {
1997 if (width < MAX_LONG_LONG_CHARS)
1998 width = MAX_LONG_LONG_CHARS;
1999 }
2000 else
2001#endif
2002 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2003 including sign. Decimal takes the most space. This
2004 isn't enough for octal. If a width is specified we
2005 need more (which we allocate later). */
2006 if (width < MAX_LONG_CHARS)
2007 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002008
2009 /* account for the size + '\0' to separate numbers
2010 inside of the numberresults buffer */
2011 numbersize += (width + 1);
2012 }
2013 }
2014 else if ((unsigned char)*f > 127) {
2015 PyErr_Format(PyExc_ValueError,
2016 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2017 "string, got a non-ASCII byte: 0x%02x",
2018 (unsigned char)*f);
2019 return NULL;
2020 }
2021 }
2022 /* step 2: allocate memory for the results of
2023 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2024 if (callcount) {
2025 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2026 if (!callresults) {
2027 PyErr_NoMemory();
2028 return NULL;
2029 }
2030 callresult = callresults;
2031 }
2032 /* step 2.5: allocate memory for the results of formating numbers */
2033 if (numbersize) {
2034 numberresults = PyObject_Malloc(numbersize);
2035 if (!numberresults) {
2036 PyErr_NoMemory();
2037 goto fail;
2038 }
2039 numberresult = numberresults;
2040 }
2041
2042 /* step 3: format numbers and figure out how large a buffer we need */
2043 for (f = format; *f; f++) {
2044 if (*f == '%') {
2045 const char* p;
2046 int longflag;
2047 int longlongflag;
2048 int size_tflag;
2049 int numprinted;
2050
2051 p = f;
2052 zeropad = (f[1] == '0');
2053 f = parse_format_flags(f, &width, &precision,
2054 &longflag, &longlongflag, &size_tflag);
2055 switch (*f) {
2056 case 'c':
2057 {
2058 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002059 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002060 n++;
2061 break;
2062 }
2063 case '%':
2064 n++;
2065 break;
2066 case 'i':
2067 case 'd':
2068 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2069 width, precision, *f);
2070 if (longflag)
2071 numprinted = sprintf(numberresult, fmt,
2072 va_arg(count, long));
2073#ifdef HAVE_LONG_LONG
2074 else if (longlongflag)
2075 numprinted = sprintf(numberresult, fmt,
2076 va_arg(count, PY_LONG_LONG));
2077#endif
2078 else if (size_tflag)
2079 numprinted = sprintf(numberresult, fmt,
2080 va_arg(count, Py_ssize_t));
2081 else
2082 numprinted = sprintf(numberresult, fmt,
2083 va_arg(count, int));
2084 n += numprinted;
2085 /* advance by +1 to skip over the '\0' */
2086 numberresult += (numprinted + 1);
2087 assert(*(numberresult - 1) == '\0');
2088 assert(*(numberresult - 2) != '\0');
2089 assert(numprinted >= 0);
2090 assert(numberresult <= numberresults + numbersize);
2091 break;
2092 case 'u':
2093 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2094 width, precision, 'u');
2095 if (longflag)
2096 numprinted = sprintf(numberresult, fmt,
2097 va_arg(count, unsigned long));
2098#ifdef HAVE_LONG_LONG
2099 else if (longlongflag)
2100 numprinted = sprintf(numberresult, fmt,
2101 va_arg(count, unsigned PY_LONG_LONG));
2102#endif
2103 else if (size_tflag)
2104 numprinted = sprintf(numberresult, fmt,
2105 va_arg(count, size_t));
2106 else
2107 numprinted = sprintf(numberresult, fmt,
2108 va_arg(count, unsigned int));
2109 n += numprinted;
2110 numberresult += (numprinted + 1);
2111 assert(*(numberresult - 1) == '\0');
2112 assert(*(numberresult - 2) != '\0');
2113 assert(numprinted >= 0);
2114 assert(numberresult <= numberresults + numbersize);
2115 break;
2116 case 'x':
2117 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2118 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2119 n += numprinted;
2120 numberresult += (numprinted + 1);
2121 assert(*(numberresult - 1) == '\0');
2122 assert(*(numberresult - 2) != '\0');
2123 assert(numprinted >= 0);
2124 assert(numberresult <= numberresults + numbersize);
2125 break;
2126 case 'p':
2127 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2128 /* %p is ill-defined: ensure leading 0x. */
2129 if (numberresult[1] == 'X')
2130 numberresult[1] = 'x';
2131 else if (numberresult[1] != 'x') {
2132 memmove(numberresult + 2, numberresult,
2133 strlen(numberresult) + 1);
2134 numberresult[0] = '0';
2135 numberresult[1] = 'x';
2136 numprinted += 2;
2137 }
2138 n += numprinted;
2139 numberresult += (numprinted + 1);
2140 assert(*(numberresult - 1) == '\0');
2141 assert(*(numberresult - 2) != '\0');
2142 assert(numprinted >= 0);
2143 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002144 break;
2145 case 's':
2146 {
2147 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002148 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002149 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
2150 if (!str)
2151 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002152 /* since PyUnicode_DecodeUTF8 returns already flexible
2153 unicode objects, there is no need to call ready on them */
2154 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002155 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002156 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002157 /* Remember the str and switch to the next slot */
2158 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002159 break;
2160 }
2161 case 'U':
2162 {
2163 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002164 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002165 if (PyUnicode_READY(obj) == -1)
2166 goto fail;
2167 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002168 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002169 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002170 break;
2171 }
2172 case 'V':
2173 {
2174 PyObject *obj = va_arg(count, PyObject *);
2175 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002176 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002177 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002178 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002179 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002180 if (PyUnicode_READY(obj) == -1)
2181 goto fail;
2182 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002183 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002184 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002185 *callresult++ = NULL;
2186 }
2187 else {
2188 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2189 if (!str_obj)
2190 goto fail;
Victor Stinnere1335c72011-10-04 20:53:03 +02002191 if (PyUnicode_READY(str_obj)) {
2192 Py_DECREF(str_obj);
2193 goto fail;
2194 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002195 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002196 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002197 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002198 *callresult++ = str_obj;
2199 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002200 break;
2201 }
2202 case 'S':
2203 {
2204 PyObject *obj = va_arg(count, PyObject *);
2205 PyObject *str;
2206 assert(obj);
2207 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002208 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002209 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002210 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002211 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002212 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002213 /* Remember the str and switch to the next slot */
2214 *callresult++ = str;
2215 break;
2216 }
2217 case 'R':
2218 {
2219 PyObject *obj = va_arg(count, PyObject *);
2220 PyObject *repr;
2221 assert(obj);
2222 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002223 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002224 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002225 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002226 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002227 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002228 /* Remember the repr and switch to the next slot */
2229 *callresult++ = repr;
2230 break;
2231 }
2232 case 'A':
2233 {
2234 PyObject *obj = va_arg(count, PyObject *);
2235 PyObject *ascii;
2236 assert(obj);
2237 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002238 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002239 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002240 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002241 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002242 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002243 /* Remember the repr and switch to the next slot */
2244 *callresult++ = ascii;
2245 break;
2246 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002247 default:
2248 /* if we stumble upon an unknown
2249 formatting code, copy the rest of
2250 the format string to the output
2251 string. (we cannot just skip the
2252 code, since there's no way to know
2253 what's in the argument list) */
2254 n += strlen(p);
2255 goto expand;
2256 }
2257 } else
2258 n++;
2259 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002260 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002261 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002262 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002263 we don't have to resize the string.
2264 There can be no errors beyond this point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002265 string = (PyUnicodeObject *)PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002266 if (!string)
2267 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002268 kind = PyUnicode_KIND(string);
2269 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002270 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002271 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002272
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002273 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002274 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002275 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002276
2277 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002278 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2279 /* checking for == because the last argument could be a empty
2280 string, which causes i to point to end, the assert at the end of
2281 the loop */
2282 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002283
Benjamin Peterson14339b62009-01-31 16:36:08 +00002284 switch (*f) {
2285 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002286 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002287 const int ordinal = va_arg(vargs, int);
2288 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002289 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002290 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002291 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002292 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002293 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002294 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002295 case 'p':
2296 /* unused, since we already have the result */
2297 if (*f == 'p')
2298 (void) va_arg(vargs, void *);
2299 else
2300 (void) va_arg(vargs, int);
2301 /* extract the result from numberresults and append. */
2302 for (; *numberresult; ++i, ++numberresult)
2303 PyUnicode_WRITE(kind, data, i, *numberresult);
2304 /* skip over the separating '\0' */
2305 assert(*numberresult == '\0');
2306 numberresult++;
2307 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002308 break;
2309 case 's':
2310 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002311 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002312 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002313 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002314 size = PyUnicode_GET_LENGTH(*callresult);
2315 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002316 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2317 *callresult, 0,
2318 size) < 0)
2319 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002320 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002321 /* We're done with the unicode()/repr() => forget it */
2322 Py_DECREF(*callresult);
2323 /* switch to next unicode()/repr() result */
2324 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002325 break;
2326 }
2327 case 'U':
2328 {
2329 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002330 Py_ssize_t size;
2331 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2332 size = PyUnicode_GET_LENGTH(obj);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002333 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2334 obj, 0,
2335 size) < 0)
2336 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002337 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002338 break;
2339 }
2340 case 'V':
2341 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002342 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002343 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002344 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002345 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002346 size = PyUnicode_GET_LENGTH(obj);
2347 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002348 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2349 obj, 0,
2350 size) < 0)
2351 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002352 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002353 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002354 size = PyUnicode_GET_LENGTH(*callresult);
2355 assert(PyUnicode_KIND(*callresult) <=
2356 PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002357 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2358 *callresult,
2359 0, size) < 0)
2360 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002361 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002362 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002363 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002364 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002365 break;
2366 }
2367 case 'S':
2368 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002369 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002370 {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002371 /* unused, since we already have the result */
2372 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002373 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002374 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2375 *callresult, 0,
2376 PyUnicode_GET_LENGTH(*callresult)) < 0)
2377 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002378 i += PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002379 /* We're done with the unicode()/repr() => forget it */
2380 Py_DECREF(*callresult);
2381 /* switch to next unicode()/repr() result */
2382 ++callresult;
2383 break;
2384 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002385 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002386 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002387 break;
2388 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002389 for (; *p; ++p, ++i)
2390 PyUnicode_WRITE(kind, data, i, *p);
2391 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002392 goto end;
2393 }
Victor Stinner1205f272010-09-11 00:54:47 +00002394 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002395 else {
2396 assert(i < PyUnicode_GET_LENGTH(string));
2397 PyUnicode_WRITE(kind, data, i++, *f);
2398 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002399 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002400 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002401
Benjamin Peterson29060642009-01-31 22:14:21 +00002402 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002403 if (callresults)
2404 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002405 if (numberresults)
2406 PyObject_Free(numberresults);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002407 assert(_PyUnicode_CheckConsistency(string, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002408 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00002409 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002410 if (callresults) {
2411 PyObject **callresult2 = callresults;
2412 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002413 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002414 ++callresult2;
2415 }
2416 PyObject_Free(callresults);
2417 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002418 if (numberresults)
2419 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002420 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002421}
2422
Walter Dörwaldd2034312007-05-18 16:29:38 +00002423PyObject *
2424PyUnicode_FromFormat(const char *format, ...)
2425{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002426 PyObject* ret;
2427 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002428
2429#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002430 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002431#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002432 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002433#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002434 ret = PyUnicode_FromFormatV(format, vargs);
2435 va_end(vargs);
2436 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002437}
2438
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002439#ifdef HAVE_WCHAR_H
2440
Victor Stinner5593d8a2010-10-02 11:11:27 +00002441/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2442 convert a Unicode object to a wide character string.
2443
Victor Stinnerd88d9832011-09-06 02:00:05 +02002444 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002445 character) required to convert the unicode object. Ignore size argument.
2446
Victor Stinnerd88d9832011-09-06 02:00:05 +02002447 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002448 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002449 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002450static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00002451unicode_aswidechar(PyUnicodeObject *unicode,
2452 wchar_t *w,
2453 Py_ssize_t size)
2454{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002455 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002456 const wchar_t *wstr;
2457
2458 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
2459 if (wstr == NULL)
2460 return -1;
2461
Victor Stinner5593d8a2010-10-02 11:11:27 +00002462 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002463 if (size > res)
2464 size = res + 1;
2465 else
2466 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002467 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002468 return res;
2469 }
2470 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002471 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002472}
2473
2474Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002475PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002476 wchar_t *w,
2477 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002478{
2479 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002480 PyErr_BadInternalCall();
2481 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002482 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002483 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002484}
2485
Victor Stinner137c34c2010-09-29 10:25:54 +00002486wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002487PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002488 Py_ssize_t *size)
2489{
2490 wchar_t* buffer;
2491 Py_ssize_t buflen;
2492
2493 if (unicode == NULL) {
2494 PyErr_BadInternalCall();
2495 return NULL;
2496 }
2497
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002498 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002499 if (buflen == -1)
2500 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002501 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002502 PyErr_NoMemory();
2503 return NULL;
2504 }
2505
Victor Stinner137c34c2010-09-29 10:25:54 +00002506 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2507 if (buffer == NULL) {
2508 PyErr_NoMemory();
2509 return NULL;
2510 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002511 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002512 if (buflen == -1)
2513 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002514 if (size != NULL)
2515 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002516 return buffer;
2517}
2518
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002519#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002520
Alexander Belopolsky40018472011-02-26 01:02:56 +00002521PyObject *
2522PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002523{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002524 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002525 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002526 PyErr_SetString(PyExc_ValueError,
2527 "chr() arg not in range(0x110000)");
2528 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002529 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002530
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002531 if (ordinal < 256)
2532 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002533
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002534 v = PyUnicode_New(1, ordinal);
2535 if (v == NULL)
2536 return NULL;
2537 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002538 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002539 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002540}
2541
Alexander Belopolsky40018472011-02-26 01:02:56 +00002542PyObject *
2543PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002544{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002545 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002546 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002547 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002548 if (PyUnicode_READY(obj))
2549 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002550 Py_INCREF(obj);
2551 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002552 }
2553 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002554 /* For a Unicode subtype that's not a Unicode object,
2555 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002556 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002557 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002558 PyErr_Format(PyExc_TypeError,
2559 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002560 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002561 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002562}
2563
Alexander Belopolsky40018472011-02-26 01:02:56 +00002564PyObject *
2565PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002566 const char *encoding,
2567 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002568{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002569 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002570 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002571
Guido van Rossumd57fd912000-03-10 22:53:23 +00002572 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002573 PyErr_BadInternalCall();
2574 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002575 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002576
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002577 /* Decoding bytes objects is the most common case and should be fast */
2578 if (PyBytes_Check(obj)) {
2579 if (PyBytes_GET_SIZE(obj) == 0) {
2580 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002581 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002582 }
2583 else {
2584 v = PyUnicode_Decode(
2585 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2586 encoding, errors);
2587 }
2588 return v;
2589 }
2590
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002591 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002592 PyErr_SetString(PyExc_TypeError,
2593 "decoding str is not supported");
2594 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002595 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002596
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002597 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2598 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2599 PyErr_Format(PyExc_TypeError,
2600 "coercing to str: need bytes, bytearray "
2601 "or buffer-like object, %.80s found",
2602 Py_TYPE(obj)->tp_name);
2603 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002604 }
Tim Petersced69f82003-09-16 20:30:58 +00002605
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002606 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002607 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002608 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002609 }
Tim Petersced69f82003-09-16 20:30:58 +00002610 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002611 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002612
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002613 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002614 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002615}
2616
/* Normalize an encoding name: convert it to lower case and replace '_' with
   '-' in order to catch e.g. UTF_8.

   The normalized, null-terminated name is stored in lower, a buffer of
   lower_len bytes.  Return 1 on success and 0 on error, i.e. when the
   encoding name is longer than lower_len-1 characters (which includes the
   case of a zero-sized buffer).  On error the content of lower is not
   null-terminated and must not be used. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    /* A zero-sized buffer cannot even hold the terminator.  Bail out before
       computing &lower[lower_len - 1]: with lower_len == 0 the index would
       wrap around and the bounds check below would never trigger. */
    if (lower_len == 0)
        return 0;

    e = encoding;
    l = lower;
    /* Last usable slot: reserved for the terminating null character. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2649
Alexander Belopolsky40018472011-02-26 01:02:56 +00002650PyObject *
2651PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002652 Py_ssize_t size,
2653 const char *encoding,
2654 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002655{
2656 PyObject *buffer = NULL, *unicode;
2657 Py_buffer info;
2658 char lower[11]; /* Enough for any encoding shortcut */
2659
2660 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002661 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00002662
2663 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002664 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002665 if ((strcmp(lower, "utf-8") == 0) ||
2666 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002667 return PyUnicode_DecodeUTF8(s, size, errors);
2668 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002669 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002670 (strcmp(lower, "iso-8859-1") == 0))
2671 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002672#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002673 else if (strcmp(lower, "mbcs") == 0)
2674 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002675#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002676 else if (strcmp(lower, "ascii") == 0)
2677 return PyUnicode_DecodeASCII(s, size, errors);
2678 else if (strcmp(lower, "utf-16") == 0)
2679 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2680 else if (strcmp(lower, "utf-32") == 0)
2681 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2682 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002683
2684 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002685 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002686 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002687 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002688 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002689 if (buffer == NULL)
2690 goto onError;
2691 unicode = PyCodec_Decode(buffer, encoding, errors);
2692 if (unicode == NULL)
2693 goto onError;
2694 if (!PyUnicode_Check(unicode)) {
2695 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002696 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002697 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002698 Py_DECREF(unicode);
2699 goto onError;
2700 }
2701 Py_DECREF(buffer);
Victor Stinner17efeed2011-10-04 20:05:46 +02002702#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02002703 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002704 Py_DECREF(unicode);
2705 return NULL;
2706 }
Victor Stinner17efeed2011-10-04 20:05:46 +02002707#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002708 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002709 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002710
Benjamin Peterson29060642009-01-31 22:14:21 +00002711 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002712 Py_XDECREF(buffer);
2713 return NULL;
2714}
2715
Alexander Belopolsky40018472011-02-26 01:02:56 +00002716PyObject *
2717PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002718 const char *encoding,
2719 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002720{
2721 PyObject *v;
2722
2723 if (!PyUnicode_Check(unicode)) {
2724 PyErr_BadArgument();
2725 goto onError;
2726 }
2727
2728 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002729 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002730
2731 /* Decode via the codec registry */
2732 v = PyCodec_Decode(unicode, encoding, errors);
2733 if (v == NULL)
2734 goto onError;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002735 assert(_PyUnicode_CheckConsistency(v, 1));
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002736 return v;
2737
Benjamin Peterson29060642009-01-31 22:14:21 +00002738 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002739 return NULL;
2740}
2741
Alexander Belopolsky40018472011-02-26 01:02:56 +00002742PyObject *
2743PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002744 const char *encoding,
2745 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002746{
2747 PyObject *v;
2748
2749 if (!PyUnicode_Check(unicode)) {
2750 PyErr_BadArgument();
2751 goto onError;
2752 }
2753
2754 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002755 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002756
2757 /* Decode via the codec registry */
2758 v = PyCodec_Decode(unicode, encoding, errors);
2759 if (v == NULL)
2760 goto onError;
2761 if (!PyUnicode_Check(v)) {
2762 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002763 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002764 Py_TYPE(v)->tp_name);
2765 Py_DECREF(v);
2766 goto onError;
2767 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002768 assert(_PyUnicode_CheckConsistency(v, 1));
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002769 return v;
2770
Benjamin Peterson29060642009-01-31 22:14:21 +00002771 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002772 return NULL;
2773}
2774
Alexander Belopolsky40018472011-02-26 01:02:56 +00002775PyObject *
2776PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002777 Py_ssize_t size,
2778 const char *encoding,
2779 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002780{
2781 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002782
Guido van Rossumd57fd912000-03-10 22:53:23 +00002783 unicode = PyUnicode_FromUnicode(s, size);
2784 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002785 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002786 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2787 Py_DECREF(unicode);
2788 return v;
2789}
2790
Alexander Belopolsky40018472011-02-26 01:02:56 +00002791PyObject *
2792PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002793 const char *encoding,
2794 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002795{
2796 PyObject *v;
2797
2798 if (!PyUnicode_Check(unicode)) {
2799 PyErr_BadArgument();
2800 goto onError;
2801 }
2802
2803 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002804 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002805
2806 /* Encode via the codec registry */
2807 v = PyCodec_Encode(unicode, encoding, errors);
2808 if (v == NULL)
2809 goto onError;
2810 return v;
2811
Benjamin Peterson29060642009-01-31 22:14:21 +00002812 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002813 return NULL;
2814}
2815
Victor Stinnerad158722010-10-27 00:25:46 +00002816PyObject *
2817PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00002818{
Victor Stinner99b95382011-07-04 14:23:54 +02002819#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002820 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2821 PyUnicode_GET_SIZE(unicode),
2822 NULL);
2823#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002824 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00002825#else
Victor Stinner793b5312011-04-27 00:24:21 +02002826 PyInterpreterState *interp = PyThreadState_GET()->interp;
2827 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2828 cannot use it to encode and decode filenames before it is loaded. Load
2829 the Python codec requires to encode at least its own filename. Use the C
2830 version of the locale codec until the codec registry is initialized and
2831 the Python codec is loaded.
2832
2833 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2834 cannot only rely on it: check also interp->fscodec_initialized for
2835 subinterpreters. */
2836 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00002837 return PyUnicode_AsEncodedString(unicode,
2838 Py_FileSystemDefaultEncoding,
2839 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00002840 }
2841 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002842 /* locale encoding with surrogateescape */
2843 wchar_t *wchar;
2844 char *bytes;
2845 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00002846 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002847
2848 wchar = PyUnicode_AsWideCharString(unicode, NULL);
2849 if (wchar == NULL)
2850 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002851 bytes = _Py_wchar2char(wchar, &error_pos);
2852 if (bytes == NULL) {
2853 if (error_pos != (size_t)-1) {
2854 char *errmsg = strerror(errno);
2855 PyObject *exc = NULL;
2856 if (errmsg == NULL)
2857 errmsg = "Py_wchar2char() failed";
2858 raise_encode_exception(&exc,
2859 "filesystemencoding",
2860 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
2861 error_pos, error_pos+1,
2862 errmsg);
2863 Py_XDECREF(exc);
2864 }
2865 else
2866 PyErr_NoMemory();
2867 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002868 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002869 }
2870 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002871
2872 bytes_obj = PyBytes_FromString(bytes);
2873 PyMem_Free(bytes);
2874 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00002875 }
Victor Stinnerad158722010-10-27 00:25:46 +00002876#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00002877}
2878
Alexander Belopolsky40018472011-02-26 01:02:56 +00002879PyObject *
2880PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002881 const char *encoding,
2882 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002883{
2884 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00002885 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00002886
Guido van Rossumd57fd912000-03-10 22:53:23 +00002887 if (!PyUnicode_Check(unicode)) {
2888 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002889 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002890 }
Fred Drakee4315f52000-05-09 19:53:39 +00002891
Victor Stinner2f283c22011-03-02 01:21:46 +00002892 if (encoding == NULL) {
2893 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002894 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002895 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002896 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner2f283c22011-03-02 01:21:46 +00002897 }
Fred Drakee4315f52000-05-09 19:53:39 +00002898
2899 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002900 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002901 if ((strcmp(lower, "utf-8") == 0) ||
2902 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00002903 {
Victor Stinner2f283c22011-03-02 01:21:46 +00002904 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002905 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002906 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002907 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00002908 }
Victor Stinner37296e82010-06-10 13:36:23 +00002909 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002910 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002911 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002912 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002913#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002914 else if (strcmp(lower, "mbcs") == 0)
2915 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2916 PyUnicode_GET_SIZE(unicode),
2917 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002918#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002919 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002920 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00002921 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002922
2923 /* Encode via the codec registry */
2924 v = PyCodec_Encode(unicode, encoding, errors);
2925 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002926 return NULL;
2927
2928 /* The normal path */
2929 if (PyBytes_Check(v))
2930 return v;
2931
2932 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002933 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002934 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002935 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002936
2937 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
2938 "encoder %s returned bytearray instead of bytes",
2939 encoding);
2940 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002941 Py_DECREF(v);
2942 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002943 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002944
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002945 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
2946 Py_DECREF(v);
2947 return b;
2948 }
2949
2950 PyErr_Format(PyExc_TypeError,
2951 "encoder did not return a bytes object (type=%.400s)",
2952 Py_TYPE(v)->tp_name);
2953 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002954 return NULL;
2955}
2956
Alexander Belopolsky40018472011-02-26 01:02:56 +00002957PyObject *
2958PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002959 const char *encoding,
2960 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002961{
2962 PyObject *v;
2963
2964 if (!PyUnicode_Check(unicode)) {
2965 PyErr_BadArgument();
2966 goto onError;
2967 }
2968
2969 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002970 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002971
2972 /* Encode via the codec registry */
2973 v = PyCodec_Encode(unicode, encoding, errors);
2974 if (v == NULL)
2975 goto onError;
2976 if (!PyUnicode_Check(v)) {
2977 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002978 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002979 Py_TYPE(v)->tp_name);
2980 Py_DECREF(v);
2981 goto onError;
2982 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002983 return v;
Tim Petersced69f82003-09-16 20:30:58 +00002984
Benjamin Peterson29060642009-01-31 22:14:21 +00002985 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002986 return NULL;
2987}
2988
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002989PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00002990PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002991 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00002992 return PyUnicode_DecodeFSDefaultAndSize(s, size);
2993}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002994
Christian Heimes5894ba72007-11-04 11:43:14 +00002995PyObject*
2996PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
2997{
Victor Stinner99b95382011-07-04 14:23:54 +02002998#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002999 return PyUnicode_DecodeMBCS(s, size, NULL);
3000#elif defined(__APPLE__)
3001 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
3002#else
Victor Stinner793b5312011-04-27 00:24:21 +02003003 PyInterpreterState *interp = PyThreadState_GET()->interp;
3004 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3005 cannot use it to encode and decode filenames before it is loaded. Load
3006 the Python codec requires to encode at least its own filename. Use the C
3007 version of the locale codec until the codec registry is initialized and
3008 the Python codec is loaded.
3009
3010 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3011 cannot only rely on it: check also interp->fscodec_initialized for
3012 subinterpreters. */
3013 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003014 return PyUnicode_Decode(s, size,
3015 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003016 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003017 }
3018 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003019 /* locale encoding with surrogateescape */
3020 wchar_t *wchar;
3021 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00003022 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003023
3024 if (s[size] != '\0' || size != strlen(s)) {
3025 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3026 return NULL;
3027 }
3028
Victor Stinner168e1172010-10-16 23:16:16 +00003029 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003030 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00003031 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003032
Victor Stinner168e1172010-10-16 23:16:16 +00003033 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003034 PyMem_Free(wchar);
3035 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003036 }
Victor Stinnerad158722010-10-27 00:25:46 +00003037#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003038}
3039
Martin v. Löwis011e8422009-05-05 04:43:17 +00003040
3041int
3042PyUnicode_FSConverter(PyObject* arg, void* addr)
3043{
3044 PyObject *output = NULL;
3045 Py_ssize_t size;
3046 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003047 if (arg == NULL) {
3048 Py_DECREF(*(PyObject**)addr);
3049 return 1;
3050 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003051 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003052 output = arg;
3053 Py_INCREF(output);
3054 }
3055 else {
3056 arg = PyUnicode_FromObject(arg);
3057 if (!arg)
3058 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003059 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003060 Py_DECREF(arg);
3061 if (!output)
3062 return 0;
3063 if (!PyBytes_Check(output)) {
3064 Py_DECREF(output);
3065 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3066 return 0;
3067 }
3068 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003069 size = PyBytes_GET_SIZE(output);
3070 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003071 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003072 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003073 Py_DECREF(output);
3074 return 0;
3075 }
3076 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003077 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003078}
3079
3080
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003081int
3082PyUnicode_FSDecoder(PyObject* arg, void* addr)
3083{
3084 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003085 if (arg == NULL) {
3086 Py_DECREF(*(PyObject**)addr);
3087 return 1;
3088 }
3089 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003090 if (PyUnicode_READY(arg))
3091 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003092 output = arg;
3093 Py_INCREF(output);
3094 }
3095 else {
3096 arg = PyBytes_FromObject(arg);
3097 if (!arg)
3098 return 0;
3099 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3100 PyBytes_GET_SIZE(arg));
3101 Py_DECREF(arg);
3102 if (!output)
3103 return 0;
3104 if (!PyUnicode_Check(output)) {
3105 Py_DECREF(output);
3106 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3107 return 0;
3108 }
3109 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003110 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
3111 PyUnicode_GET_LENGTH(output), 0, 1)) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003112 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3113 Py_DECREF(output);
3114 return 0;
3115 }
3116 *(PyObject**)addr = output;
3117 return Py_CLEANUP_SUPPORTED;
3118}
3119
3120
Martin v. Löwis5b222132007-06-10 09:51:05 +00003121char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003122PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003123{
Christian Heimesf3863112007-11-22 07:46:41 +00003124 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003125 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
3126
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003127 if (!PyUnicode_Check(unicode)) {
3128 PyErr_BadArgument();
3129 return NULL;
3130 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003131 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003132 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003133
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003134 if (PyUnicode_UTF8(unicode) == NULL) {
3135 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003136 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3137 if (bytes == NULL)
3138 return NULL;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003139 _PyUnicode_UTF8(u) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3140 if (_PyUnicode_UTF8(u) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003141 Py_DECREF(bytes);
3142 return NULL;
3143 }
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003144 _PyUnicode_UTF8_LENGTH(u) = PyBytes_GET_SIZE(bytes);
3145 Py_MEMCPY(_PyUnicode_UTF8(u), PyBytes_AS_STRING(bytes), _PyUnicode_UTF8_LENGTH(u) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003146 Py_DECREF(bytes);
3147 }
3148
3149 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003150 *psize = PyUnicode_UTF8_LENGTH(unicode);
3151 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003152}
3153
3154char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003155PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003156{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003157 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3158}
3159
3160#ifdef Py_DEBUG
3161int unicode_as_unicode_calls = 0;
3162#endif
3163
3164
3165Py_UNICODE *
3166PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3167{
3168 PyUnicodeObject *u;
3169 const unsigned char *one_byte;
3170#if SIZEOF_WCHAR_T == 4
3171 const Py_UCS2 *two_bytes;
3172#else
3173 const Py_UCS4 *four_bytes;
3174 const Py_UCS4 *ucs4_end;
3175 Py_ssize_t num_surrogates;
3176#endif
3177 wchar_t *w;
3178 wchar_t *wchar_end;
3179
3180 if (!PyUnicode_Check(unicode)) {
3181 PyErr_BadArgument();
3182 return NULL;
3183 }
3184 u = (PyUnicodeObject*)unicode;
3185 if (_PyUnicode_WSTR(u) == NULL) {
3186 /* Non-ASCII compact unicode object */
3187 assert(_PyUnicode_KIND(u) != 0);
3188 assert(PyUnicode_IS_READY(u));
3189
3190#ifdef Py_DEBUG
3191 ++unicode_as_unicode_calls;
3192#endif
3193
3194 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
3195#if SIZEOF_WCHAR_T == 2
3196 four_bytes = PyUnicode_4BYTE_DATA(u);
3197 ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
3198 num_surrogates = 0;
3199
3200 for (; four_bytes < ucs4_end; ++four_bytes) {
3201 if (*four_bytes > 0xFFFF)
3202 ++num_surrogates;
3203 }
3204
3205 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
3206 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
3207 if (!_PyUnicode_WSTR(u)) {
3208 PyErr_NoMemory();
3209 return NULL;
3210 }
3211 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;
3212
3213 w = _PyUnicode_WSTR(u);
3214 wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
3215 four_bytes = PyUnicode_4BYTE_DATA(u);
3216 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3217 if (*four_bytes > 0xFFFF) {
3218 /* encode surrogate pair in this case */
3219 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
3220 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
3221 }
3222 else
3223 *w = *four_bytes;
3224
3225 if (w > wchar_end) {
3226 assert(0 && "Miscalculated string end");
3227 }
3228 }
3229 *w = 0;
3230#else
3231 /* sizeof(wchar_t) == 4 */
3232 Py_FatalError("Impossible unicode object state, wstr and str "
3233 "should share memory already.");
3234 return NULL;
3235#endif
3236 }
3237 else {
3238 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3239 (_PyUnicode_LENGTH(u) + 1));
3240 if (!_PyUnicode_WSTR(u)) {
3241 PyErr_NoMemory();
3242 return NULL;
3243 }
3244 if (!PyUnicode_IS_COMPACT_ASCII(u))
3245 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
3246 w = _PyUnicode_WSTR(u);
3247 wchar_end = w + _PyUnicode_LENGTH(u);
3248
3249 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
3250 one_byte = PyUnicode_1BYTE_DATA(u);
3251 for (; w < wchar_end; ++one_byte, ++w)
3252 *w = *one_byte;
3253 /* null-terminate the wstr */
3254 *w = 0;
3255 }
3256 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
3257#if SIZEOF_WCHAR_T == 4
3258 two_bytes = PyUnicode_2BYTE_DATA(u);
3259 for (; w < wchar_end; ++two_bytes, ++w)
3260 *w = *two_bytes;
3261 /* null-terminate the wstr */
3262 *w = 0;
3263#else
3264 /* sizeof(wchar_t) == 2 */
3265 PyObject_FREE(_PyUnicode_WSTR(u));
3266 _PyUnicode_WSTR(u) = NULL;
3267 Py_FatalError("Impossible unicode object state, wstr "
3268 "and str should share memory already.");
3269 return NULL;
3270#endif
3271 }
3272 else {
3273 assert(0 && "This should never happen.");
3274 }
3275 }
3276 }
3277 if (size != NULL)
3278 *size = PyUnicode_WSTR_LENGTH(u);
3279 return _PyUnicode_WSTR(u);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003280}
3281
Alexander Belopolsky40018472011-02-26 01:02:56 +00003282Py_UNICODE *
3283PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003284{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003285 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003286}
3287
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003288
Alexander Belopolsky40018472011-02-26 01:02:56 +00003289Py_ssize_t
3290PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003291{
3292 if (!PyUnicode_Check(unicode)) {
3293 PyErr_BadArgument();
3294 goto onError;
3295 }
3296 return PyUnicode_GET_SIZE(unicode);
3297
Benjamin Peterson29060642009-01-31 22:14:21 +00003298 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003299 return -1;
3300}
3301
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003302Py_ssize_t
3303PyUnicode_GetLength(PyObject *unicode)
3304{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003305 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003306 PyErr_BadArgument();
3307 return -1;
3308 }
3309
3310 return PyUnicode_GET_LENGTH(unicode);
3311}
3312
3313Py_UCS4
3314PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3315{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003316 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3317 PyErr_BadArgument();
3318 return (Py_UCS4)-1;
3319 }
3320 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3321 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003322 return (Py_UCS4)-1;
3323 }
3324 return PyUnicode_READ_CHAR(unicode, index);
3325}
3326
3327int
3328PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3329{
3330 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003331 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003332 return -1;
3333 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02003334 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3335 PyErr_SetString(PyExc_IndexError, "string index out of range");
3336 return -1;
3337 }
3338 if (_PyUnicode_Dirty(unicode))
3339 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003340 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3341 index, ch);
3342 return 0;
3343}
3344
/* Return the name of the default encoding as a NUL-terminated string with
   static storage duration; the value is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3350
Victor Stinner554f3f02010-06-16 23:33:54 +00003351/* create or adjust a UnicodeDecodeError */
3352static void
3353make_decode_exception(PyObject **exceptionObject,
3354 const char *encoding,
3355 const char *input, Py_ssize_t length,
3356 Py_ssize_t startpos, Py_ssize_t endpos,
3357 const char *reason)
3358{
3359 if (*exceptionObject == NULL) {
3360 *exceptionObject = PyUnicodeDecodeError_Create(
3361 encoding, input, length, startpos, endpos, reason);
3362 }
3363 else {
3364 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3365 goto onError;
3366 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3367 goto onError;
3368 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3369 goto onError;
3370 }
3371 return;
3372
3373onError:
3374 Py_DECREF(*exceptionObject);
3375 *exceptionObject = NULL;
3376}
3377
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003378/* error handling callback helper:
3379 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003380 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003381 and adjust various state variables.
3382 return 0 on success, -1 on error
3383*/
3384
Alexander Belopolsky40018472011-02-26 01:02:56 +00003385static int
3386unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003387 const char *encoding, const char *reason,
3388 const char **input, const char **inend, Py_ssize_t *startinpos,
3389 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
3390 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003391{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003392 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003393
3394 PyObject *restuple = NULL;
3395 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003396 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003397 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003398 Py_ssize_t requiredsize;
3399 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003400 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003401 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003402 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003403 int res = -1;
3404
3405 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003406 *errorHandler = PyCodec_LookupError(errors);
3407 if (*errorHandler == NULL)
3408 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003409 }
3410
Victor Stinner554f3f02010-06-16 23:33:54 +00003411 make_decode_exception(exceptionObject,
3412 encoding,
3413 *input, *inend - *input,
3414 *startinpos, *endinpos,
3415 reason);
3416 if (*exceptionObject == NULL)
3417 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003418
3419 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3420 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003421 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003422 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003423 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003424 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003425 }
3426 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003427 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003428
3429 /* Copy back the bytes variables, which might have been modified by the
3430 callback */
3431 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3432 if (!inputobj)
3433 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003434 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003435 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003436 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003437 *input = PyBytes_AS_STRING(inputobj);
3438 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003439 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003440 /* we can DECREF safely, as the exception has another reference,
3441 so the object won't go away. */
3442 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003443
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003444 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003445 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003446 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003447 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3448 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003449 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003450
3451 /* need more space? (at least enough for what we
3452 have+the replacement+the rest of the string (starting
3453 at the new input position), so we won't have to check space
3454 when there are no errors in the rest of the string) */
3455 repptr = PyUnicode_AS_UNICODE(repunicode);
3456 repsize = PyUnicode_GET_SIZE(repunicode);
3457 requiredsize = *outpos + repsize + insize-newpos;
3458 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003459 if (requiredsize<2*outsize)
3460 requiredsize = 2*outsize;
Victor Stinnerfe226c02011-10-03 03:52:20 +02003461 if (PyUnicode_Resize((PyObject**)output, requiredsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003462 goto onError;
3463 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003464 }
3465 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003466 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003467 Py_UNICODE_COPY(*outptr, repptr, repsize);
3468 *outptr += repsize;
3469 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003470
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003471 /* we made it! */
3472 res = 0;
3473
Benjamin Peterson29060642009-01-31 22:14:21 +00003474 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003475 Py_XDECREF(restuple);
3476 return res;
3477}
3478
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003479/* --- UTF-7 Codec -------------------------------------------------------- */
3480
Antoine Pitrou244651a2009-05-04 18:56:13 +00003481/* See RFC2152 for details. We encode conservatively and decode liberally. */
3482
/* Base-64 helpers for the UTF-7 codec.  All of them are macros that
   evaluate their argument more than once, so pass plain values only. */

/* IS_BASE64: true when c is one of A-Z, a-z, 0-9, '+' or '/'. */

#define IS_BASE64(c)                        \
    (('A' <= (c) && (c) <= 'Z') ||          \
     ('a' <= (c) && (c) <= 'z') ||          \
     ('0' <= (c) && (c) <= '9') ||          \
     (c) == '+' ||                          \
     (c) == '/')

/* FROM_BASE64: the 6-bit value (0..63) of c, which must already be known
   to be a base-64 character. */

#define FROM_BASE64(c)                                  \
    (('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :           \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :      \
     (c) == '+' ? 62 : 63)

/* TO_BASE64: the base-64 character for the low 6 bits of n. */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: true when a byte met in a UTF-7 string decodes as
 * itself.  Decoding is permissive: every ASCII byte qualifies except
 * '+', which opens a base-64 section. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) <= 127)
3513
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c:        the character; must satisfy 0 < c < 128 to qualify at all.
 * directO:  non-zero to emit "Set O" characters unencoded.
 * directWS: non-zero to emit whitespace unencoded.
 *
 * Every argument is parenthesized in the expansion so that an expression
 * such as `a || b` passed for a flag is combined as a unit. */

#define ENCODE_DIRECT(c, directO, directWS)                 \
    ((c) < 128 && (c) > 0 &&                                \
     ((utf7_category[(c)] == 0) ||                          \
      ((directWS) && (utf7_category[(c)] == 2)) ||          \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003559
Alexander Belopolsky40018472011-02-26 01:02:56 +00003560PyObject *
3561PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003562 Py_ssize_t size,
3563 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003564{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003565 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3566}
3567
Antoine Pitrou244651a2009-05-04 18:56:13 +00003568/* The decoder. The only state we preserve is our read position,
3569 * i.e. how many characters we have consumed. So if we end in the
3570 * middle of a shift sequence we have to back off the read position
3571 * and the output to the beginning of the sequence, otherwise we lose
3572 * all the shift state (seen bits, number of bits seen, high
3573 * surrogate). */
3574
Alexander Belopolsky40018472011-02-26 01:02:56 +00003575PyObject *
3576PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003577 Py_ssize_t size,
3578 const char *errors,
3579 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003580{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003581 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003582 Py_ssize_t startinpos;
3583 Py_ssize_t endinpos;
3584 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003585 const char *e;
3586 PyUnicodeObject *unicode;
3587 Py_UNICODE *p;
3588 const char *errmsg = "";
3589 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003590 Py_UNICODE *shiftOutStart;
3591 unsigned int base64bits = 0;
3592 unsigned long base64buffer = 0;
3593 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003594 PyObject *errorHandler = NULL;
3595 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003596
3597 unicode = _PyUnicode_New(size);
3598 if (!unicode)
3599 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003600 if (size == 0) {
3601 if (consumed)
3602 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003603 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003604 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003605
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003606 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003607 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003608 e = s + size;
3609
3610 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003611 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003612 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003613 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003614
Antoine Pitrou244651a2009-05-04 18:56:13 +00003615 if (inShift) { /* in a base-64 section */
3616 if (IS_BASE64(ch)) { /* consume a base-64 character */
3617 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3618 base64bits += 6;
3619 s++;
3620 if (base64bits >= 16) {
3621 /* we have enough bits for a UTF-16 value */
3622 Py_UNICODE outCh = (Py_UNICODE)
3623 (base64buffer >> (base64bits-16));
3624 base64bits -= 16;
3625 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3626 if (surrogate) {
3627 /* expecting a second surrogate */
3628 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3629#ifdef Py_UNICODE_WIDE
3630 *p++ = (((surrogate & 0x3FF)<<10)
3631 | (outCh & 0x3FF)) + 0x10000;
3632#else
3633 *p++ = surrogate;
3634 *p++ = outCh;
3635#endif
3636 surrogate = 0;
3637 }
3638 else {
3639 surrogate = 0;
3640 errmsg = "second surrogate missing";
3641 goto utf7Error;
3642 }
3643 }
3644 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3645 /* first surrogate */
3646 surrogate = outCh;
3647 }
3648 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3649 errmsg = "unexpected second surrogate";
3650 goto utf7Error;
3651 }
3652 else {
3653 *p++ = outCh;
3654 }
3655 }
3656 }
3657 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003658 inShift = 0;
3659 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003660 if (surrogate) {
3661 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003662 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003663 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003664 if (base64bits > 0) { /* left-over bits */
3665 if (base64bits >= 6) {
3666 /* We've seen at least one base-64 character */
3667 errmsg = "partial character in shift sequence";
3668 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003669 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003670 else {
3671 /* Some bits remain; they should be zero */
3672 if (base64buffer != 0) {
3673 errmsg = "non-zero padding bits in shift sequence";
3674 goto utf7Error;
3675 }
3676 }
3677 }
3678 if (ch != '-') {
3679 /* '-' is absorbed; other terminating
3680 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003681 *p++ = ch;
3682 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003683 }
3684 }
3685 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003686 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003687 s++; /* consume '+' */
3688 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003689 s++;
3690 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003691 }
3692 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003693 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003694 shiftOutStart = p;
3695 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003696 }
3697 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003698 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003699 *p++ = ch;
3700 s++;
3701 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003702 else {
3703 startinpos = s-starts;
3704 s++;
3705 errmsg = "unexpected special character";
3706 goto utf7Error;
3707 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003708 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003709utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003710 outpos = p-PyUnicode_AS_UNICODE(unicode);
3711 endinpos = s-starts;
3712 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003713 errors, &errorHandler,
3714 "utf7", errmsg,
3715 &starts, &e, &startinpos, &endinpos, &exc, &s,
3716 &unicode, &outpos, &p))
3717 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003718 }
3719
Antoine Pitrou244651a2009-05-04 18:56:13 +00003720 /* end of string */
3721
3722 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3723 /* if we're in an inconsistent state, that's an error */
3724 if (surrogate ||
3725 (base64bits >= 6) ||
3726 (base64bits > 0 && base64buffer != 0)) {
3727 outpos = p-PyUnicode_AS_UNICODE(unicode);
3728 endinpos = size;
3729 if (unicode_decode_call_errorhandler(
3730 errors, &errorHandler,
3731 "utf7", "unterminated shift sequence",
3732 &starts, &e, &startinpos, &endinpos, &exc, &s,
3733 &unicode, &outpos, &p))
3734 goto onError;
3735 if (s < e)
3736 goto restart;
3737 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003738 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003739
3740 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003741 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003742 if (inShift) {
3743 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003744 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003745 }
3746 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003747 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003748 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003749 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003750
Victor Stinnerfe226c02011-10-03 03:52:20 +02003751 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003752 goto onError;
3753
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003754 Py_XDECREF(errorHandler);
3755 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02003756#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02003757 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003758 Py_DECREF(unicode);
3759 return NULL;
3760 }
Victor Stinner17efeed2011-10-04 20:05:46 +02003761#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02003762 assert(_PyUnicode_CheckConsistency(unicode, 1));
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003763 return (PyObject *)unicode;
3764
Benjamin Peterson29060642009-01-31 22:14:21 +00003765 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003766 Py_XDECREF(errorHandler);
3767 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003768 Py_DECREF(unicode);
3769 return NULL;
3770}
3771
3772
Alexander Belopolsky40018472011-02-26 01:02:56 +00003773PyObject *
3774PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003775 Py_ssize_t size,
3776 int base64SetO,
3777 int base64WhiteSpace,
3778 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003779{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003780 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003781 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003782 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003783 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003784 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003785 unsigned int base64bits = 0;
3786 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003787 char * out;
3788 char * start;
3789
3790 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003791 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003792
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003793 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003794 return PyErr_NoMemory();
3795
Antoine Pitrou244651a2009-05-04 18:56:13 +00003796 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003797 if (v == NULL)
3798 return NULL;
3799
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003800 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003801 for (;i < size; ++i) {
3802 Py_UNICODE ch = s[i];
3803
Antoine Pitrou244651a2009-05-04 18:56:13 +00003804 if (inShift) {
3805 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3806 /* shifting out */
3807 if (base64bits) { /* output remaining bits */
3808 *out++ = TO_BASE64(base64buffer << (6-base64bits));
3809 base64buffer = 0;
3810 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003811 }
3812 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003813 /* Characters not in the BASE64 set implicitly unshift the sequence
3814 so no '-' is required, except if the character is itself a '-' */
3815 if (IS_BASE64(ch) || ch == '-') {
3816 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003817 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003818 *out++ = (char) ch;
3819 }
3820 else {
3821 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00003822 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003823 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003824 else { /* not in a shift sequence */
3825 if (ch == '+') {
3826 *out++ = '+';
3827 *out++ = '-';
3828 }
3829 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3830 *out++ = (char) ch;
3831 }
3832 else {
3833 *out++ = '+';
3834 inShift = 1;
3835 goto encode_char;
3836 }
3837 }
3838 continue;
3839encode_char:
3840#ifdef Py_UNICODE_WIDE
3841 if (ch >= 0x10000) {
3842 /* code first surrogate */
3843 base64bits += 16;
3844 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
3845 while (base64bits >= 6) {
3846 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3847 base64bits -= 6;
3848 }
3849 /* prepare second surrogate */
3850 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
3851 }
3852#endif
3853 base64bits += 16;
3854 base64buffer = (base64buffer << 16) | ch;
3855 while (base64bits >= 6) {
3856 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3857 base64bits -= 6;
3858 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00003859 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003860 if (base64bits)
3861 *out++= TO_BASE64(base64buffer << (6-base64bits) );
3862 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003863 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003864 if (_PyBytes_Resize(&v, out - start) < 0)
3865 return NULL;
3866 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003867}
3868
/* Drop the helper macros used by the UTF-7 decoder and encoder above so
   they do not leak into the rest of the file. */
Antoine Pitrou244651a2009-05-04 18:56:13 +00003869#undef IS_BASE64
3870#undef FROM_BASE64
3871#undef TO_BASE64
3872#undef DECODE_DIRECT
3873#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003874
Guido van Rossumd57fd912000-03-10 22:53:23 +00003875/* --- UTF-8 Codec -------------------------------------------------------- */
3876
/* Lookup table indexed by a byte value: the total length, in bytes, of the
   UTF-8 sequence that the byte introduces.  A zero entry means the byte is
   an illegal prefix.  See RFC 3629 for details. */
static char utf8_code_length[256] = {
    /* 00-0F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 10-1F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 20-2F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 30-3F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 40-4F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 50-5F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 60-6F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 70-7F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 80-8F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 90-9F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* A0-AF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* B0-BF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* C0-C1 illegal, C2-CF two bytes */
                0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* D0-DF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* E0-EF */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* F0-F4 four bytes, F5-FF illegal */
                4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
3898
Alexander Belopolsky40018472011-02-26 01:02:56 +00003899PyObject *
3900PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003901 Py_ssize_t size,
3902 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003903{
Walter Dörwald69652032004-09-07 20:24:22 +00003904 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3905}
3906
Antoine Pitrouab868312009-01-10 15:40:25 +00003907/* Mask to check or force alignment of a pointer to C 'long' boundaries */
3908#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
3909
3910/* Mask to quickly check whether a C 'long' contains a
3911 non-ASCII, UTF8-encoded char. */
3912#if (SIZEOF_LONG == 8)
3913# define ASCII_CHAR_MASK 0x8080808080808080L
3914#elif (SIZEOF_LONG == 4)
3915# define ASCII_CHAR_MASK 0x80808080L
3916#else
3917# error C 'long' size should be either 4 or 8!
3918#endif
3919
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003920/* Scans a UTF-8 string and returns the maximum character to be expected,
3921 the size of the decoded unicode string and if any major errors were
3922 encountered.
3923
3924 This function does check basic UTF-8 sanity, it does however NOT CHECK
3925 if the string contains surrogates, and if all continuation bytes are
3926 within the correct ranges, these checks are performed in
3927 PyUnicode_DecodeUTF8Stateful.
3928
3929 If it sets has_errors to 1, it means the value of unicode_size and max_char
3930 will be bogus and you should not rely on useful information in them.
3931 */
3932static Py_UCS4
3933utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
3934 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
3935 int *has_errors)
3936{
3937 Py_ssize_t n;
3938 Py_ssize_t char_count = 0;
3939 Py_UCS4 max_char = 127, new_max;
3940 Py_UCS4 upper_bound;
3941 const unsigned char *p = (const unsigned char *)s;
3942 const unsigned char *end = p + string_size;
3943 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
3944 int err = 0;
3945
3946 for (; p < end && !err; ++p, ++char_count) {
3947 /* Only check value if it's not a ASCII char... */
3948 if (*p < 0x80) {
3949 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
3950 an explanation. */
3951 if (!((size_t) p & LONG_PTR_MASK)) {
3952 /* Help register allocation */
3953 register const unsigned char *_p = p;
3954 while (_p < aligned_end) {
3955 unsigned long value = *(unsigned long *) _p;
3956 if (value & ASCII_CHAR_MASK)
3957 break;
3958 _p += SIZEOF_LONG;
3959 char_count += SIZEOF_LONG;
3960 }
3961 p = _p;
3962 if (p == end)
3963 break;
3964 }
3965 }
3966 if (*p >= 0x80) {
3967 n = utf8_code_length[*p];
3968 new_max = max_char;
3969 switch (n) {
3970 /* invalid start byte */
3971 case 0:
3972 err = 1;
3973 break;
3974 case 2:
3975 /* Code points between 0x00FF and 0x07FF inclusive.
3976 Approximate the upper bound of the code point,
3977 if this flips over 255 we can be sure it will be more
3978 than 255 and the string will need 2 bytes per code coint,
3979 if it stays under or equal to 255, we can be sure 1 byte
3980 is enough.
3981 ((*p & 0b00011111) << 6) | 0b00111111 */
3982 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
3983 if (max_char < upper_bound)
3984 new_max = upper_bound;
3985 /* Ensure we track at least that we left ASCII space. */
3986 if (new_max < 128)
3987 new_max = 128;
3988 break;
3989 case 3:
3990 /* Between 0x0FFF and 0xFFFF inclusive, so values are
3991 always > 255 and <= 65535 and will always need 2 bytes. */
3992 if (max_char < 65535)
3993 new_max = 65535;
3994 break;
3995 case 4:
3996 /* Code point will be above 0xFFFF for sure in this case. */
3997 new_max = 65537;
3998 break;
3999 /* Internal error, this should be caught by the first if */
4000 case 1:
4001 default:
4002 assert(0 && "Impossible case in utf8_max_char_and_size");
4003 err = 1;
4004 }
4005 /* Instead of number of overall bytes for this code point,
Georg Brandl7597add2011-10-05 16:36:47 +02004006 n contains the number of following bytes: */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004007 --n;
4008 /* Check if the follow up chars are all valid continuation bytes */
4009 if (n >= 1) {
4010 const unsigned char *cont;
4011 if ((p + n) >= end) {
4012 if (consumed == 0)
4013 /* incomplete data, non-incremental decoding */
4014 err = 1;
4015 break;
4016 }
4017 for (cont = p + 1; cont < (p + n); ++cont) {
4018 if ((*cont & 0xc0) != 0x80) {
4019 err = 1;
4020 break;
4021 }
4022 }
4023 p += n;
4024 }
4025 else
4026 err = 1;
4027 max_char = new_max;
4028 }
4029 }
4030
4031 if (unicode_size)
4032 *unicode_size = char_count;
4033 if (has_errors)
4034 *has_errors = err;
4035 return max_char;
4036}
4037
/* Similar to PyUnicode_WRITE but can also write into wstr field
   of the legacy unicode representation.

   Stores `value` at position `index` of `buf`, viewing the buffer as an
   array of the element type selected by `kind`: Py_UNICODE for
   PyUnicode_WCHAR_KIND, unsigned char for PyUnicode_1BYTE_KIND, Py_UCS2
   for PyUnicode_2BYTE_KIND and Py_UCS4 for every other kind.  `kind` is
   evaluated once up front; `buf`, `index` and `value` are evaluated once,
   in the branch that is taken. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value) \
    do { \
        const int k_ = (kind); \
        switch (k_) { \
        case PyUnicode_WCHAR_KIND: \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
            break; \
        case PyUnicode_1BYTE_KIND: \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
            break; \
        case PyUnicode_2BYTE_KIND: \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value); \
            break; \
        default: \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value); \
            break; \
        } \
    } while (0)
4052
Alexander Belopolsky40018472011-02-26 01:02:56 +00004053PyObject *
4054PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004055 Py_ssize_t size,
4056 const char *errors,
4057 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004058{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004059 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004060 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004061 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004062 Py_ssize_t startinpos;
4063 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00004064 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004065 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004066 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004067 PyObject *errorHandler = NULL;
4068 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004069 Py_UCS4 maxchar = 0;
4070 Py_ssize_t unicode_size;
4071 Py_ssize_t i;
4072 int kind;
4073 void *data;
4074 int has_errors;
4075 Py_UNICODE *error_outptr;
4076#if SIZEOF_WCHAR_T == 2
4077 Py_ssize_t wchar_offset = 0;
4078#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004079
Walter Dörwald69652032004-09-07 20:24:22 +00004080 if (size == 0) {
4081 if (consumed)
4082 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004083 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00004084 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004085 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
4086 consumed, &has_errors);
4087 if (has_errors) {
4088 unicode = _PyUnicode_New(size);
4089 if (!unicode)
4090 return NULL;
4091 kind = PyUnicode_WCHAR_KIND;
4092 data = PyUnicode_AS_UNICODE(unicode);
4093 assert(data != NULL);
4094 }
4095 else {
4096 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
4097 if (!unicode)
4098 return NULL;
4099 /* When the string is ASCII only, just use memcpy and return.
4100 unicode_size may be != size if there is an incomplete UTF-8
4101 sequence at the end of the ASCII block. */
4102 if (maxchar < 128 && size == unicode_size) {
4103 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
4104 return (PyObject *)unicode;
4105 }
4106 kind = PyUnicode_KIND(unicode);
4107 data = PyUnicode_DATA(unicode);
4108 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004109 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004110 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004111 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00004112 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004113
4114 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004115 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004116
4117 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004118 /* Fast path for runs of ASCII characters. Given that common UTF-8
4119 input will consist of an overwhelming majority of ASCII
4120 characters, we try to optimize for this case by checking
4121 as many characters as a C 'long' can contain.
4122 First, check if we can do an aligned read, as most CPUs have
4123 a penalty for unaligned reads.
4124 */
4125 if (!((size_t) s & LONG_PTR_MASK)) {
4126 /* Help register allocation */
4127 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004128 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004129 while (_s < aligned_end) {
4130 /* Read a whole long at a time (either 4 or 8 bytes),
4131 and do a fast unrolled copy if it only contains ASCII
4132 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004133 unsigned long value = *(unsigned long *) _s;
4134 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004135 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004136 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
4137 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
4138 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
4139 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004140#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004141 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
4142 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
4143 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
4144 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004145#endif
4146 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004147 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004148 }
4149 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004150 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004151 if (s == e)
4152 break;
4153 ch = (unsigned char)*s;
4154 }
4155 }
4156
4157 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004158 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004159 s++;
4160 continue;
4161 }
4162
4163 n = utf8_code_length[ch];
4164
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004165 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004166 if (consumed)
4167 break;
4168 else {
4169 errmsg = "unexpected end of data";
4170 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004171 endinpos = startinpos+1;
4172 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4173 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004174 goto utf8Error;
4175 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004176 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004177
4178 switch (n) {
4179
4180 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004181 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004182 startinpos = s-starts;
4183 endinpos = startinpos+1;
4184 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004185
4186 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004187 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004188 startinpos = s-starts;
4189 endinpos = startinpos+1;
4190 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004191
4192 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004193 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004194 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004195 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004196 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004197 goto utf8Error;
4198 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004199 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004200 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004201 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004202 break;
4203
4204 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004205 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4206 will result in surrogates in range d800-dfff. Surrogates are
4207 not valid UTF-8 so they are rejected.
4208 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4209 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004210 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004211 (s[2] & 0xc0) != 0x80 ||
4212 ((unsigned char)s[0] == 0xE0 &&
4213 (unsigned char)s[1] < 0xA0) ||
4214 ((unsigned char)s[0] == 0xED &&
4215 (unsigned char)s[1] > 0x9F)) {
4216 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004217 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004218 endinpos = startinpos + 1;
4219
4220 /* if s[1] first two bits are 1 and 0, then the invalid
4221 continuation byte is s[2], so increment endinpos by 1,
4222 if not, s[1] is invalid and endinpos doesn't need to
4223 be incremented. */
4224 if ((s[1] & 0xC0) == 0x80)
4225 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004226 goto utf8Error;
4227 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004228 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004229 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004230 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004231 break;
4232
4233 case 4:
4234 if ((s[1] & 0xc0) != 0x80 ||
4235 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004236 (s[3] & 0xc0) != 0x80 ||
4237 ((unsigned char)s[0] == 0xF0 &&
4238 (unsigned char)s[1] < 0x90) ||
4239 ((unsigned char)s[0] == 0xF4 &&
4240 (unsigned char)s[1] > 0x8F)) {
4241 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004242 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004243 endinpos = startinpos + 1;
4244 if ((s[1] & 0xC0) == 0x80) {
4245 endinpos++;
4246 if ((s[2] & 0xC0) == 0x80)
4247 endinpos++;
4248 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004249 goto utf8Error;
4250 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004251 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004252 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4253 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4254
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004255 /* If the string is flexible or we have native UCS-4, write
4256 directly.. */
4257 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
4258 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00004259
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004260 else {
4261 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00004262
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004263 /* translate from 10000..10FFFF to 0..FFFF */
4264 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00004265
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004266 /* high surrogate = top 10 bits added to D800 */
4267 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4268 (Py_UNICODE)(0xD800 + (ch >> 10)));
4269
4270 /* low surrogate = bottom 10 bits added to DC00 */
4271 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4272 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
4273 }
4274#if SIZEOF_WCHAR_T == 2
4275 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004276#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004277 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004278 }
4279 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004280 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004281
Benjamin Peterson29060642009-01-31 22:14:21 +00004282 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004283 /* If this is not yet a resizable string, make it one.. */
4284 if (kind != PyUnicode_WCHAR_KIND) {
4285 const Py_UNICODE *u;
4286 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
4287 if (!new_unicode)
4288 goto onError;
4289 u = PyUnicode_AsUnicode((PyObject *)unicode);
4290 if (!u)
4291 goto onError;
4292#if SIZEOF_WCHAR_T == 2
4293 i += wchar_offset;
4294#endif
4295 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
4296 Py_DECREF(unicode);
4297 unicode = new_unicode;
4298 kind = 0;
4299 data = PyUnicode_AS_UNICODE(new_unicode);
4300 assert(data != NULL);
4301 }
4302 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00004303 if (unicode_decode_call_errorhandler(
4304 errors, &errorHandler,
4305 "utf8", errmsg,
4306 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004307 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00004308 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004309 /* Update data because unicode_decode_call_errorhandler might have
4310 re-created or resized the unicode object. */
4311 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00004312 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004313 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004314 /* Ensure the unicode_size calculation above was correct: */
4315 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
4316
Walter Dörwald69652032004-09-07 20:24:22 +00004317 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004318 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004319
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004320 /* Adjust length and ready string when it contained errors and
4321 is of the old resizable kind. */
4322 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004323 if (PyUnicode_Resize((PyObject**)&unicode, i) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004324 goto onError;
4325 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004326
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004327 Py_XDECREF(errorHandler);
4328 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02004329#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004330 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004331 Py_DECREF(unicode);
4332 return NULL;
4333 }
Victor Stinner17efeed2011-10-04 20:05:46 +02004334#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004335 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004336 return (PyObject *)unicode;
4337
Benjamin Peterson29060642009-01-31 22:14:21 +00004338 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004339 Py_XDECREF(errorHandler);
4340 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004341 Py_DECREF(unicode);
4342 return NULL;
4343}
4344
/* WRITE_FLEXIBLE_OR_WSTR was only needed by the UTF-8 decoder above. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004345#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00004346
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004347#ifdef __APPLE__
4348
4349/* Simplified UTF-8 decoder using surrogateescape error handler,
4350 used to decode the command line arguments on Mac OS X. */
4351
4352wchar_t*
4353_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4354{
4355 int n;
4356 const char *e;
4357 wchar_t *unicode, *p;
4358
4359 /* Note: size will always be longer than the resulting Unicode
4360 character count */
4361 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
4362 PyErr_NoMemory();
4363 return NULL;
4364 }
4365 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4366 if (!unicode)
4367 return NULL;
4368
4369 /* Unpack UTF-8 encoded data */
4370 p = unicode;
4371 e = s + size;
4372 while (s < e) {
4373 Py_UCS4 ch = (unsigned char)*s;
4374
4375 if (ch < 0x80) {
4376 *p++ = (wchar_t)ch;
4377 s++;
4378 continue;
4379 }
4380
4381 n = utf8_code_length[ch];
4382 if (s + n > e) {
4383 goto surrogateescape;
4384 }
4385
4386 switch (n) {
4387 case 0:
4388 case 1:
4389 goto surrogateescape;
4390
4391 case 2:
4392 if ((s[1] & 0xc0) != 0x80)
4393 goto surrogateescape;
4394 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
4395 assert ((ch > 0x007F) && (ch <= 0x07FF));
4396 *p++ = (wchar_t)ch;
4397 break;
4398
4399 case 3:
4400 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4401 will result in surrogates in range d800-dfff. Surrogates are
4402 not valid UTF-8 so they are rejected.
4403 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4404 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
4405 if ((s[1] & 0xc0) != 0x80 ||
4406 (s[2] & 0xc0) != 0x80 ||
4407 ((unsigned char)s[0] == 0xE0 &&
4408 (unsigned char)s[1] < 0xA0) ||
4409 ((unsigned char)s[0] == 0xED &&
4410 (unsigned char)s[1] > 0x9F)) {
4411
4412 goto surrogateescape;
4413 }
4414 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4415 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004416 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004417 break;
4418
4419 case 4:
4420 if ((s[1] & 0xc0) != 0x80 ||
4421 (s[2] & 0xc0) != 0x80 ||
4422 (s[3] & 0xc0) != 0x80 ||
4423 ((unsigned char)s[0] == 0xF0 &&
4424 (unsigned char)s[1] < 0x90) ||
4425 ((unsigned char)s[0] == 0xF4 &&
4426 (unsigned char)s[1] > 0x8F)) {
4427 goto surrogateescape;
4428 }
4429 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
4430 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4431 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4432
4433#if SIZEOF_WCHAR_T == 4
4434 *p++ = (wchar_t)ch;
4435#else
4436 /* compute and append the two surrogates: */
4437
4438 /* translate from 10000..10FFFF to 0..FFFF */
4439 ch -= 0x10000;
4440
4441 /* high surrogate = top 10 bits added to D800 */
4442 *p++ = (wchar_t)(0xD800 + (ch >> 10));
4443
4444 /* low surrogate = bottom 10 bits added to DC00 */
4445 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
4446#endif
4447 break;
4448 }
4449 s += n;
4450 continue;
4451
4452 surrogateescape:
4453 *p++ = 0xDC00 + ch;
4454 s++;
4455 }
4456 *p = L'\0';
4457 return unicode;
4458}
4459
4460#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004461
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004462/* Primary internal function which creates utf8 encoded bytes objects.
4463
4464 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004465 and allocate exactly as much space needed at the end. Else allocate the
4466 maximum possible needed (4 result bytes per Unicode character), and return
4467 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004468*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004469PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004470_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004471{
Tim Peters602f7402002-04-27 18:03:26 +00004472#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004473
Guido van Rossum98297ee2007-11-06 21:34:58 +00004474 Py_ssize_t i; /* index into s of next input byte */
4475 PyObject *result; /* result string object */
4476 char *p; /* next free byte in output buffer */
4477 Py_ssize_t nallocated; /* number of result bytes allocated */
4478 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004479 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004480 PyObject *errorHandler = NULL;
4481 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004482 int kind;
4483 void *data;
4484 Py_ssize_t size;
4485 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
4486#if SIZEOF_WCHAR_T == 2
4487 Py_ssize_t wchar_offset = 0;
4488#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004489
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004490 if (!PyUnicode_Check(unicode)) {
4491 PyErr_BadArgument();
4492 return NULL;
4493 }
4494
4495 if (PyUnicode_READY(unicode) == -1)
4496 return NULL;
4497
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004498 if (PyUnicode_UTF8(unicode))
4499 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4500 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004501
4502 kind = PyUnicode_KIND(unicode);
4503 data = PyUnicode_DATA(unicode);
4504 size = PyUnicode_GET_LENGTH(unicode);
4505
Tim Peters602f7402002-04-27 18:03:26 +00004506 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004507
Tim Peters602f7402002-04-27 18:03:26 +00004508 if (size <= MAX_SHORT_UNICHARS) {
4509 /* Write into the stack buffer; nallocated can't overflow.
4510 * At the end, we'll allocate exactly as much heap space as it
4511 * turns out we need.
4512 */
4513 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004514 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004515 p = stackbuf;
4516 }
4517 else {
4518 /* Overallocate on the heap, and give the excess back at the end. */
4519 nallocated = size * 4;
4520 if (nallocated / 4 != size) /* overflow! */
4521 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004522 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004523 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004524 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004525 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004526 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004527
Tim Peters602f7402002-04-27 18:03:26 +00004528 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004529 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004530
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004531 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004532 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004533 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004534
Guido van Rossumd57fd912000-03-10 22:53:23 +00004535 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004536 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004537 *p++ = (char)(0xc0 | (ch >> 6));
4538 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004539 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004540 Py_ssize_t newpos;
4541 PyObject *rep;
4542 Py_ssize_t repsize, k, startpos;
4543 startpos = i-1;
4544#if SIZEOF_WCHAR_T == 2
4545 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00004546#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004547 rep = unicode_encode_call_errorhandler(
4548 errors, &errorHandler, "utf-8", "surrogates not allowed",
4549 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4550 &exc, startpos, startpos+1, &newpos);
4551 if (!rep)
4552 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004553
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004554 if (PyBytes_Check(rep))
4555 repsize = PyBytes_GET_SIZE(rep);
4556 else
4557 repsize = PyUnicode_GET_SIZE(rep);
4558
4559 if (repsize > 4) {
4560 Py_ssize_t offset;
4561
4562 if (result == NULL)
4563 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004564 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004565 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004566
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004567 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4568 /* integer overflow */
4569 PyErr_NoMemory();
4570 goto error;
4571 }
4572 nallocated += repsize - 4;
4573 if (result != NULL) {
4574 if (_PyBytes_Resize(&result, nallocated) < 0)
4575 goto error;
4576 } else {
4577 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004578 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004579 goto error;
4580 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4581 }
4582 p = PyBytes_AS_STRING(result) + offset;
4583 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004584
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004585 if (PyBytes_Check(rep)) {
4586 char *prep = PyBytes_AS_STRING(rep);
4587 for(k = repsize; k > 0; k--)
4588 *p++ = *prep++;
4589 } else /* rep is unicode */ {
4590 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4591 Py_UNICODE c;
4592
4593 for(k=0; k<repsize; k++) {
4594 c = prep[k];
4595 if (0x80 <= c) {
4596 raise_encode_exception(&exc, "utf-8",
4597 PyUnicode_AS_UNICODE(unicode),
4598 size, i-1, i,
4599 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004600 goto error;
4601 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004602 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004603 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004604 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004605 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004606 } else if (ch < 0x10000) {
4607 *p++ = (char)(0xe0 | (ch >> 12));
4608 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4609 *p++ = (char)(0x80 | (ch & 0x3f));
4610 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004611 /* Encode UCS4 Unicode ordinals */
4612 *p++ = (char)(0xf0 | (ch >> 18));
4613 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4614 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4615 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004616#if SIZEOF_WCHAR_T == 2
4617 wchar_offset++;
4618#endif
Tim Peters602f7402002-04-27 18:03:26 +00004619 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004620 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004621
Guido van Rossum98297ee2007-11-06 21:34:58 +00004622 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004623 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004624 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004625 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004626 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004627 }
4628 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004629 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004630 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004631 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004632 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004633 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004634
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004635 Py_XDECREF(errorHandler);
4636 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004637 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004638 error:
4639 Py_XDECREF(errorHandler);
4640 Py_XDECREF(exc);
4641 Py_XDECREF(result);
4642 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004643
Tim Peters602f7402002-04-27 18:03:26 +00004644#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004645}
4646
Alexander Belopolsky40018472011-02-26 01:02:56 +00004647PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004648PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4649 Py_ssize_t size,
4650 const char *errors)
4651{
4652 PyObject *v, *unicode;
4653
4654 unicode = PyUnicode_FromUnicode(s, size);
4655 if (unicode == NULL)
4656 return NULL;
4657 v = _PyUnicode_AsUTF8String(unicode, errors);
4658 Py_DECREF(unicode);
4659 return v;
4660}
4661
4662PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004663PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004664{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004665 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004666}
4667
Walter Dörwald41980ca2007-08-16 21:55:45 +00004668/* --- UTF-32 Codec ------------------------------------------------------- */
4669
4670PyObject *
4671PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004672 Py_ssize_t size,
4673 const char *errors,
4674 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004675{
4676 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4677}
4678
4679PyObject *
4680PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004681 Py_ssize_t size,
4682 const char *errors,
4683 int *byteorder,
4684 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004685{
4686 const char *starts = s;
4687 Py_ssize_t startinpos;
4688 Py_ssize_t endinpos;
4689 Py_ssize_t outpos;
4690 PyUnicodeObject *unicode;
4691 Py_UNICODE *p;
4692#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004693 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004694 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004695#else
4696 const int pairs = 0;
4697#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004698 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004699 int bo = 0; /* assume native ordering by default */
4700 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004701 /* Offsets from q for retrieving bytes in the right order. */
4702#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4703 int iorder[] = {0, 1, 2, 3};
4704#else
4705 int iorder[] = {3, 2, 1, 0};
4706#endif
4707 PyObject *errorHandler = NULL;
4708 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004709
Walter Dörwald41980ca2007-08-16 21:55:45 +00004710 q = (unsigned char *)s;
4711 e = q + size;
4712
4713 if (byteorder)
4714 bo = *byteorder;
4715
4716 /* Check for BOM marks (U+FEFF) in the input and adjust current
4717 byte order setting accordingly. In native mode, the leading BOM
4718 mark is skipped, in all other modes, it is copied to the output
4719 stream as-is (giving a ZWNBSP character). */
4720 if (bo == 0) {
4721 if (size >= 4) {
4722 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004723 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004724#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004725 if (bom == 0x0000FEFF) {
4726 q += 4;
4727 bo = -1;
4728 }
4729 else if (bom == 0xFFFE0000) {
4730 q += 4;
4731 bo = 1;
4732 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004733#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004734 if (bom == 0x0000FEFF) {
4735 q += 4;
4736 bo = 1;
4737 }
4738 else if (bom == 0xFFFE0000) {
4739 q += 4;
4740 bo = -1;
4741 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004742#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004743 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004744 }
4745
4746 if (bo == -1) {
4747 /* force LE */
4748 iorder[0] = 0;
4749 iorder[1] = 1;
4750 iorder[2] = 2;
4751 iorder[3] = 3;
4752 }
4753 else if (bo == 1) {
4754 /* force BE */
4755 iorder[0] = 3;
4756 iorder[1] = 2;
4757 iorder[2] = 1;
4758 iorder[3] = 0;
4759 }
4760
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004761 /* On narrow builds we split characters outside the BMP into two
4762 codepoints => count how much extra space we need. */
4763#ifndef Py_UNICODE_WIDE
4764 for (qq = q; qq < e; qq += 4)
4765 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4766 pairs++;
4767#endif
4768
4769 /* This might be one to much, because of a BOM */
4770 unicode = _PyUnicode_New((size+3)/4+pairs);
4771 if (!unicode)
4772 return NULL;
4773 if (size == 0)
4774 return (PyObject *)unicode;
4775
4776 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004777 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004778
Walter Dörwald41980ca2007-08-16 21:55:45 +00004779 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004780 Py_UCS4 ch;
4781 /* remaining bytes at the end? (size should be divisible by 4) */
4782 if (e-q<4) {
4783 if (consumed)
4784 break;
4785 errmsg = "truncated data";
4786 startinpos = ((const char *)q)-starts;
4787 endinpos = ((const char *)e)-starts;
4788 goto utf32Error;
4789 /* The remaining input chars are ignored if the callback
4790 chooses to skip the input */
4791 }
4792 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4793 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004794
Benjamin Peterson29060642009-01-31 22:14:21 +00004795 if (ch >= 0x110000)
4796 {
4797 errmsg = "codepoint not in range(0x110000)";
4798 startinpos = ((const char *)q)-starts;
4799 endinpos = startinpos+4;
4800 goto utf32Error;
4801 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004802#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004803 if (ch >= 0x10000)
4804 {
4805 *p++ = 0xD800 | ((ch-0x10000) >> 10);
4806 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4807 }
4808 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00004809#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004810 *p++ = ch;
4811 q += 4;
4812 continue;
4813 utf32Error:
4814 outpos = p-PyUnicode_AS_UNICODE(unicode);
4815 if (unicode_decode_call_errorhandler(
4816 errors, &errorHandler,
4817 "utf32", errmsg,
4818 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4819 &unicode, &outpos, &p))
4820 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004821 }
4822
4823 if (byteorder)
4824 *byteorder = bo;
4825
4826 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004827 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004828
4829 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02004830 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004831 goto onError;
4832
4833 Py_XDECREF(errorHandler);
4834 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02004835#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004836 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004837 Py_DECREF(unicode);
4838 return NULL;
4839 }
Victor Stinner17efeed2011-10-04 20:05:46 +02004840#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004841 assert(_PyUnicode_CheckConsistency(unicode, 1));
Walter Dörwald41980ca2007-08-16 21:55:45 +00004842 return (PyObject *)unicode;
4843
Benjamin Peterson29060642009-01-31 22:14:21 +00004844 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00004845 Py_DECREF(unicode);
4846 Py_XDECREF(errorHandler);
4847 Py_XDECREF(exc);
4848 return NULL;
4849}
4850
4851PyObject *
4852PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004853 Py_ssize_t size,
4854 const char *errors,
4855 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004856{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004857 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004858 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004859 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004860#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004861 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004862#else
4863 const int pairs = 0;
4864#endif
4865 /* Offsets from p for storing byte pairs in the right order. */
4866#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4867 int iorder[] = {0, 1, 2, 3};
4868#else
4869 int iorder[] = {3, 2, 1, 0};
4870#endif
4871
Benjamin Peterson29060642009-01-31 22:14:21 +00004872#define STORECHAR(CH) \
4873 do { \
4874 p[iorder[3]] = ((CH) >> 24) & 0xff; \
4875 p[iorder[2]] = ((CH) >> 16) & 0xff; \
4876 p[iorder[1]] = ((CH) >> 8) & 0xff; \
4877 p[iorder[0]] = (CH) & 0xff; \
4878 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00004879 } while(0)
4880
4881 /* In narrow builds we can output surrogate pairs as one codepoint,
4882 so we need less space. */
4883#ifndef Py_UNICODE_WIDE
4884 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004885 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
4886 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
4887 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004888#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004889 nsize = (size - pairs + (byteorder == 0));
4890 bytesize = nsize * 4;
4891 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004892 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004893 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004894 if (v == NULL)
4895 return NULL;
4896
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004897 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004898 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004899 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004900 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004901 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004902
4903 if (byteorder == -1) {
4904 /* force LE */
4905 iorder[0] = 0;
4906 iorder[1] = 1;
4907 iorder[2] = 2;
4908 iorder[3] = 3;
4909 }
4910 else if (byteorder == 1) {
4911 /* force BE */
4912 iorder[0] = 3;
4913 iorder[1] = 2;
4914 iorder[2] = 1;
4915 iorder[3] = 0;
4916 }
4917
4918 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004919 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004920#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004921 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
4922 Py_UCS4 ch2 = *s;
4923 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
4924 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
4925 s++;
4926 size--;
4927 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004928 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004929#endif
4930 STORECHAR(ch);
4931 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004932
4933 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004934 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004935#undef STORECHAR
4936}
4937
Alexander Belopolsky40018472011-02-26 01:02:56 +00004938PyObject *
4939PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004940{
4941 if (!PyUnicode_Check(unicode)) {
4942 PyErr_BadArgument();
4943 return NULL;
4944 }
4945 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004946 PyUnicode_GET_SIZE(unicode),
4947 NULL,
4948 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004949}
4950
Guido van Rossumd57fd912000-03-10 22:53:23 +00004951/* --- UTF-16 Codec ------------------------------------------------------- */
4952
Tim Peters772747b2001-08-09 22:21:55 +00004953PyObject *
4954PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004955 Py_ssize_t size,
4956 const char *errors,
4957 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004958{
Walter Dörwald69652032004-09-07 20:24:22 +00004959 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
4960}
4961
/* Two masks for fast checking of whether a C 'long' may contain
   UTF16-encoded surrogate characters. This is an efficient heuristic,
   assuming that non-surrogate characters with a code point >= 0x8000 are
   rare in most input.
   FAST_CHAR_MASK is used when the input is in native byte ordering,
   SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.

   Each mask has one bit set per 16-bit unit of the long: bit 15 of the
   unit for FAST_CHAR_MASK, bit 7 for SWAPPED_FAST_CHAR_MASK.  A nonzero
   AND with the mask means the word needs the slower per-unit handling.
*/
#if (SIZEOF_LONG == 8)
/* 64-bit long: four 16-bit units per word */
# define FAST_CHAR_MASK 0x8000800080008000L
# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
#elif (SIZEOF_LONG == 4)
/* 32-bit long: two 16-bit units per word */
# define FAST_CHAR_MASK 0x80008000L
# define SWAPPED_FAST_CHAR_MASK 0x00800080L
#else
# error C 'long' size should be either 4 or 8!
#endif
4978
Walter Dörwald69652032004-09-07 20:24:22 +00004979PyObject *
4980PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004981 Py_ssize_t size,
4982 const char *errors,
4983 int *byteorder,
4984 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004985{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004986 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004987 Py_ssize_t startinpos;
4988 Py_ssize_t endinpos;
4989 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004990 PyUnicodeObject *unicode;
4991 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004992 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00004993 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00004994 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004995 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00004996 /* Offsets from q for retrieving byte pairs in the right order. */
4997#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4998 int ihi = 1, ilo = 0;
4999#else
5000 int ihi = 0, ilo = 1;
5001#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005002 PyObject *errorHandler = NULL;
5003 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005004
5005 /* Note: size will always be longer than the resulting Unicode
5006 character count */
5007 unicode = _PyUnicode_New(size);
5008 if (!unicode)
5009 return NULL;
5010 if (size == 0)
5011 return (PyObject *)unicode;
5012
5013 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005014 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00005015 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005016 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005017
5018 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005019 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005020
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005021 /* Check for BOM marks (U+FEFF) in the input and adjust current
5022 byte order setting accordingly. In native mode, the leading BOM
5023 mark is skipped, in all other modes, it is copied to the output
5024 stream as-is (giving a ZWNBSP character). */
5025 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005026 if (size >= 2) {
5027 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005028#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005029 if (bom == 0xFEFF) {
5030 q += 2;
5031 bo = -1;
5032 }
5033 else if (bom == 0xFFFE) {
5034 q += 2;
5035 bo = 1;
5036 }
Tim Petersced69f82003-09-16 20:30:58 +00005037#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005038 if (bom == 0xFEFF) {
5039 q += 2;
5040 bo = 1;
5041 }
5042 else if (bom == 0xFFFE) {
5043 q += 2;
5044 bo = -1;
5045 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005046#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005047 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005048 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005049
Tim Peters772747b2001-08-09 22:21:55 +00005050 if (bo == -1) {
5051 /* force LE */
5052 ihi = 1;
5053 ilo = 0;
5054 }
5055 else if (bo == 1) {
5056 /* force BE */
5057 ihi = 0;
5058 ilo = 1;
5059 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005060#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5061 native_ordering = ilo < ihi;
5062#else
5063 native_ordering = ilo > ihi;
5064#endif
Tim Peters772747b2001-08-09 22:21:55 +00005065
Antoine Pitrouab868312009-01-10 15:40:25 +00005066 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005067 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005068 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005069 /* First check for possible aligned read of a C 'long'. Unaligned
5070 reads are more expensive, better to defer to another iteration. */
5071 if (!((size_t) q & LONG_PTR_MASK)) {
5072 /* Fast path for runs of non-surrogate chars. */
5073 register const unsigned char *_q = q;
5074 Py_UNICODE *_p = p;
5075 if (native_ordering) {
5076 /* Native ordering is simple: as long as the input cannot
5077 possibly contain a surrogate char, do an unrolled copy
5078 of several 16-bit code points to the target object.
5079 The non-surrogate check is done on several input bytes
5080 at a time (as many as a C 'long' can contain). */
5081 while (_q < aligned_end) {
5082 unsigned long data = * (unsigned long *) _q;
5083 if (data & FAST_CHAR_MASK)
5084 break;
5085 _p[0] = ((unsigned short *) _q)[0];
5086 _p[1] = ((unsigned short *) _q)[1];
5087#if (SIZEOF_LONG == 8)
5088 _p[2] = ((unsigned short *) _q)[2];
5089 _p[3] = ((unsigned short *) _q)[3];
5090#endif
5091 _q += SIZEOF_LONG;
5092 _p += SIZEOF_LONG / 2;
5093 }
5094 }
5095 else {
5096 /* Byteswapped ordering is similar, but we must decompose
5097 the copy bytewise, and take care of zero'ing out the
5098 upper bytes if the target object is in 32-bit units
5099 (that is, in UCS-4 builds). */
5100 while (_q < aligned_end) {
5101 unsigned long data = * (unsigned long *) _q;
5102 if (data & SWAPPED_FAST_CHAR_MASK)
5103 break;
5104 /* Zero upper bytes in UCS-4 builds */
5105#if (Py_UNICODE_SIZE > 2)
5106 _p[0] = 0;
5107 _p[1] = 0;
5108#if (SIZEOF_LONG == 8)
5109 _p[2] = 0;
5110 _p[3] = 0;
5111#endif
5112#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005113 /* Issue #4916; UCS-4 builds on big endian machines must
5114 fill the two last bytes of each 4-byte unit. */
5115#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
5116# define OFF 2
5117#else
5118# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00005119#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005120 ((unsigned char *) _p)[OFF + 1] = _q[0];
5121 ((unsigned char *) _p)[OFF + 0] = _q[1];
5122 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
5123 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
5124#if (SIZEOF_LONG == 8)
5125 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
5126 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
5127 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
5128 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
5129#endif
5130#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00005131 _q += SIZEOF_LONG;
5132 _p += SIZEOF_LONG / 2;
5133 }
5134 }
5135 p = _p;
5136 q = _q;
5137 if (q >= e)
5138 break;
5139 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005140 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005141
Benjamin Peterson14339b62009-01-31 16:36:08 +00005142 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005143
5144 if (ch < 0xD800 || ch > 0xDFFF) {
5145 *p++ = ch;
5146 continue;
5147 }
5148
5149 /* UTF-16 code pair: */
5150 if (q > e) {
5151 errmsg = "unexpected end of data";
5152 startinpos = (((const char *)q) - 2) - starts;
5153 endinpos = ((const char *)e) + 1 - starts;
5154 goto utf16Error;
5155 }
5156 if (0xD800 <= ch && ch <= 0xDBFF) {
5157 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
5158 q += 2;
5159 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00005160#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005161 *p++ = ch;
5162 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005163#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005164 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005165#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005166 continue;
5167 }
5168 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005169 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005170 startinpos = (((const char *)q)-4)-starts;
5171 endinpos = startinpos+2;
5172 goto utf16Error;
5173 }
5174
Benjamin Peterson14339b62009-01-31 16:36:08 +00005175 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005176 errmsg = "illegal encoding";
5177 startinpos = (((const char *)q)-2)-starts;
5178 endinpos = startinpos+2;
5179 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005180
Benjamin Peterson29060642009-01-31 22:14:21 +00005181 utf16Error:
5182 outpos = p - PyUnicode_AS_UNICODE(unicode);
5183 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005184 errors,
5185 &errorHandler,
5186 "utf16", errmsg,
5187 &starts,
5188 (const char **)&e,
5189 &startinpos,
5190 &endinpos,
5191 &exc,
5192 (const char **)&q,
5193 &unicode,
5194 &outpos,
5195 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00005196 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005197 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005198 /* remaining byte at the end? (size should be even) */
5199 if (e == q) {
5200 if (!consumed) {
5201 errmsg = "truncated data";
5202 startinpos = ((const char *)q) - starts;
5203 endinpos = ((const char *)e) + 1 - starts;
5204 outpos = p - PyUnicode_AS_UNICODE(unicode);
5205 if (unicode_decode_call_errorhandler(
5206 errors,
5207 &errorHandler,
5208 "utf16", errmsg,
5209 &starts,
5210 (const char **)&e,
5211 &startinpos,
5212 &endinpos,
5213 &exc,
5214 (const char **)&q,
5215 &unicode,
5216 &outpos,
5217 &p))
5218 goto onError;
5219 /* The remaining input chars are ignored if the callback
5220 chooses to skip the input */
5221 }
5222 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005223
5224 if (byteorder)
5225 *byteorder = bo;
5226
Walter Dörwald69652032004-09-07 20:24:22 +00005227 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005228 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005229
Guido van Rossumd57fd912000-03-10 22:53:23 +00005230 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02005231 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005232 goto onError;
5233
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005234 Py_XDECREF(errorHandler);
5235 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005236#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005237 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005238 Py_DECREF(unicode);
5239 return NULL;
5240 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005241#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005242 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005243 return (PyObject *)unicode;
5244
Benjamin Peterson29060642009-01-31 22:14:21 +00005245 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005246 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005247 Py_XDECREF(errorHandler);
5248 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005249 return NULL;
5250}
5251
Antoine Pitrouab868312009-01-10 15:40:25 +00005252#undef FAST_CHAR_MASK
5253#undef SWAPPED_FAST_CHAR_MASK
5254
Tim Peters772747b2001-08-09 22:21:55 +00005255PyObject *
5256PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005257 Py_ssize_t size,
5258 const char *errors,
5259 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005260{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005261 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005262 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005263 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005264#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005265 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005266#else
5267 const int pairs = 0;
5268#endif
Tim Peters772747b2001-08-09 22:21:55 +00005269 /* Offsets from p for storing byte pairs in the right order. */
5270#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5271 int ihi = 1, ilo = 0;
5272#else
5273 int ihi = 0, ilo = 1;
5274#endif
5275
Benjamin Peterson29060642009-01-31 22:14:21 +00005276#define STORECHAR(CH) \
5277 do { \
5278 p[ihi] = ((CH) >> 8) & 0xff; \
5279 p[ilo] = (CH) & 0xff; \
5280 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005281 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005282
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005283#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005284 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00005285 if (s[i] >= 0x10000)
5286 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005287#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005288 /* 2 * (size + pairs + (byteorder == 0)) */
5289 if (size > PY_SSIZE_T_MAX ||
5290 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005291 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005292 nsize = size + pairs + (byteorder == 0);
5293 bytesize = nsize * 2;
5294 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005295 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005296 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005297 if (v == NULL)
5298 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005299
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005300 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005301 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005302 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005303 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005304 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005305
5306 if (byteorder == -1) {
5307 /* force LE */
5308 ihi = 1;
5309 ilo = 0;
5310 }
5311 else if (byteorder == 1) {
5312 /* force BE */
5313 ihi = 0;
5314 ilo = 1;
5315 }
5316
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005317 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005318 Py_UNICODE ch = *s++;
5319 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005320#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005321 if (ch >= 0x10000) {
5322 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5323 ch = 0xD800 | ((ch-0x10000) >> 10);
5324 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005325#endif
Tim Peters772747b2001-08-09 22:21:55 +00005326 STORECHAR(ch);
5327 if (ch2)
5328 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005329 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005330
5331 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005332 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005333#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005334}
5335
Alexander Belopolsky40018472011-02-26 01:02:56 +00005336PyObject *
5337PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005338{
5339 if (!PyUnicode_Check(unicode)) {
5340 PyErr_BadArgument();
5341 return NULL;
5342 }
5343 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005344 PyUnicode_GET_SIZE(unicode),
5345 NULL,
5346 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005347}
5348
5349/* --- Unicode Escape Codec ----------------------------------------------- */
5350
/* Helper function for PyUnicode_DecodeUnicodeEscape: determines whether
   the escaped input decodes to a pure ASCII string, and if so how long
   that string is.

   s     points to the escaped input bytes
   size  is the number of bytes at s

   Returns the number of characters the decoded string will contain, or
   -1 when the result cannot be predicted as ASCII:
     - size is negative,
     - the input contains a byte > 127,
     - a backslash is the last byte, or is followed by a byte > 127,
     - an octal, \x, \u, \U or \N escape is present (these do not
       guarantee ASCII characters).
 */
Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Validate size before doing any pointer arithmetic with it:
       forming p + size for a negative size is undefined behaviour. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                /* single-character escape: one output character */
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* unknown escape: count the backslash + the other
                   character */
                length += 2;
                break;
            }
        }
    }
    return length;
}
5405
/* Store value at position index of buf.  When kind is
   PyUnicode_WCHAR_KIND the buffer is addressed as a Py_UNICODE array;
   for any other kind it is addressed as an array of bytes. */
#define WRITE_ASCII_OR_WSTR(kind, buf, index, value)                    \
    do {                                                                \
        if ((kind) == PyUnicode_WCHAR_KIND)                             \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);       \
        else                                                            \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
    } while (0)

/* Store value at position index of a Py_UNICODE buffer.  Expects a
   variable named `kind` to be in scope where the macro is expanded and
   asserts that it equals PyUnicode_WCHAR_KIND. */
#define WRITE_WSTR(buf, index, value)                                   \
    (assert(kind == PyUnicode_WCHAR_KIND),                              \
     ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value))

5419
5420
Fredrik Lundh06d12682001-01-24 07:59:11 +00005421static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005422
Alexander Belopolsky40018472011-02-26 01:02:56 +00005423PyObject *
5424PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005425 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005426 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005427{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005428 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005429 Py_ssize_t startinpos;
5430 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005431 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005432 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005433 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005434 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005435 char* message;
5436 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005437 PyObject *errorHandler = NULL;
5438 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005439 Py_ssize_t ascii_length;
5440 Py_ssize_t i;
5441 int kind;
5442 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005443
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005444 ascii_length = length_of_escaped_ascii_string(s, size);
5445
5446 /* After length_of_escaped_ascii_string() there are two alternatives,
5447 either the string is pure ASCII with named escapes like \n, etc.
5448 and we determined it's exact size (common case)
5449 or it contains \x, \u, ... escape sequences. then we create a
5450 legacy wchar string and resize it at the end of this function. */
5451 if (ascii_length >= 0) {
5452 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
5453 if (!v)
5454 goto onError;
5455 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5456 kind = PyUnicode_1BYTE_KIND;
5457 data = PyUnicode_DATA(v);
5458 }
5459 else {
5460 /* Escaped strings will always be longer than the resulting
5461 Unicode string, so we start with size here and then reduce the
5462 length after conversion to the true value.
5463 (but if the error callback returns a long replacement string
5464 we'll have to allocate more space) */
5465 v = _PyUnicode_New(size);
5466 if (!v)
5467 goto onError;
5468 kind = PyUnicode_WCHAR_KIND;
5469 data = PyUnicode_AS_UNICODE(v);
5470 }
5471
Guido van Rossumd57fd912000-03-10 22:53:23 +00005472 if (size == 0)
5473 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005474 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005475 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005476
Guido van Rossumd57fd912000-03-10 22:53:23 +00005477 while (s < end) {
5478 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005479 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005480 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005481
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005482 if (kind == PyUnicode_WCHAR_KIND) {
5483 assert(i < _PyUnicode_WSTR_LENGTH(v));
5484 }
5485 else {
5486 /* The only case in which i == ascii_length is a backslash
5487 followed by a newline. */
5488 assert(i <= ascii_length);
5489 }
5490
Guido van Rossumd57fd912000-03-10 22:53:23 +00005491 /* Non-escape characters are interpreted as Unicode ordinals */
5492 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005493 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005494 continue;
5495 }
5496
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005497 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005498 /* \ - Escapes */
5499 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005500 c = *s++;
5501 if (s > end)
5502 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005503
5504 if (kind == PyUnicode_WCHAR_KIND) {
5505 assert(i < _PyUnicode_WSTR_LENGTH(v));
5506 }
5507 else {
5508 /* The only case in which i == ascii_length is a backslash
5509 followed by a newline. */
5510 assert(i < ascii_length || (i == ascii_length && c == '\n'));
5511 }
5512
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005513 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005514
Benjamin Peterson29060642009-01-31 22:14:21 +00005515 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005516 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005517 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5518 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5519 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5520 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5521 /* FF */
5522 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5523 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5524 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5525 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5526 /* VT */
5527 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5528 /* BEL, not classic C */
5529 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005530
Benjamin Peterson29060642009-01-31 22:14:21 +00005531 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005532 case '0': case '1': case '2': case '3':
5533 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005534 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005535 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005536 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005537 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005538 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005539 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005540 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005541 break;
5542
Benjamin Peterson29060642009-01-31 22:14:21 +00005543 /* hex escapes */
5544 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005545 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005546 digits = 2;
5547 message = "truncated \\xXX escape";
5548 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005549
Benjamin Peterson29060642009-01-31 22:14:21 +00005550 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005551 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005552 digits = 4;
5553 message = "truncated \\uXXXX escape";
5554 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005555
Benjamin Peterson29060642009-01-31 22:14:21 +00005556 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005557 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005558 digits = 8;
5559 message = "truncated \\UXXXXXXXX escape";
5560 hexescape:
5561 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005562 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005563 if (s+digits>end) {
5564 endinpos = size;
5565 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005566 errors, &errorHandler,
5567 "unicodeescape", "end of string in escape sequence",
5568 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005569 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005570 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005571 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005572 goto nextByte;
5573 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005574 for (j = 0; j < digits; ++j) {
5575 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005576 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005577 endinpos = (s+j+1)-starts;
5578 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005579 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005580 errors, &errorHandler,
5581 "unicodeescape", message,
5582 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005583 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005584 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005585 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005586 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005587 }
5588 chr = (chr<<4) & ~0xF;
5589 if (c >= '0' && c <= '9')
5590 chr += c - '0';
5591 else if (c >= 'a' && c <= 'f')
5592 chr += 10 + c - 'a';
5593 else
5594 chr += 10 + c - 'A';
5595 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005596 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005597 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005598 /* _decoding_error will have already written into the
5599 target buffer. */
5600 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005601 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005602 /* when we get here, chr is a 32-bit unicode character */
5603 if (chr <= 0xffff)
5604 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005605 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005606 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005607 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005608 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005609#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005610 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005611#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005612 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005613 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5614 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005615#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005616 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005617 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005618 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005619 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005620 errors, &errorHandler,
5621 "unicodeescape", "illegal Unicode character",
5622 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005623 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005624 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005625 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005626 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005627 break;
5628
Benjamin Peterson29060642009-01-31 22:14:21 +00005629 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005630 case 'N':
5631 message = "malformed \\N character escape";
5632 if (ucnhash_CAPI == NULL) {
5633 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005634 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5635 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005636 if (ucnhash_CAPI == NULL)
5637 goto ucnhashError;
5638 }
5639 if (*s == '{') {
5640 const char *start = s+1;
5641 /* look for the closing brace */
5642 while (*s != '}' && s < end)
5643 s++;
5644 if (s > start && s < end && *s == '}') {
5645 /* found a name. look it up in the unicode database */
5646 message = "unknown Unicode character name";
5647 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005648 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5649 &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005650 goto store;
5651 }
5652 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005653 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005654 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005655 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005656 errors, &errorHandler,
5657 "unicodeescape", message,
5658 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005659 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005660 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005661 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005662 break;
5663
5664 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005665 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005666 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005667 message = "\\ at end of string";
5668 s--;
5669 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005670 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005671 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005672 errors, &errorHandler,
5673 "unicodeescape", message,
5674 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005675 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005676 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005677 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005678 }
5679 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005680 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5681 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005682 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005683 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005684 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005685 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005686 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005687 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005688 /* Ensure the length prediction worked in case of ASCII strings */
5689 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5690
Victor Stinnerfe226c02011-10-03 03:52:20 +02005691 if (kind == PyUnicode_WCHAR_KIND)
5692 {
5693 if (PyUnicode_Resize((PyObject**)&v, i) < 0)
5694 goto onError;
Victor Stinnerfe226c02011-10-03 03:52:20 +02005695 }
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005696 Py_XDECREF(errorHandler);
5697 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005698#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005699 if (_PyUnicode_READY_REPLACE(&v)) {
5700 Py_DECREF(v);
5701 return NULL;
5702 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005703#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005704 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005705 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005706
Benjamin Peterson29060642009-01-31 22:14:21 +00005707 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005708 PyErr_SetString(
5709 PyExc_UnicodeError,
5710 "\\N escapes not supported (can't load unicodedata module)"
5711 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005712 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005713 Py_XDECREF(errorHandler);
5714 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005715 return NULL;
5716
Benjamin Peterson29060642009-01-31 22:14:21 +00005717 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005718 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005719 Py_XDECREF(errorHandler);
5720 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005721 return NULL;
5722}
5723
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005724#undef WRITE_ASCII_OR_WSTR
5725#undef WRITE_WSTR
5726
Guido van Rossumd57fd912000-03-10 22:53:23 +00005727/* Return a Unicode-Escape string version of the Unicode object.
5728
5729 If quotes is true, the string is enclosed in u"" or u'' quotes as
5730 appropriate.
5731
5732*/
5733
Walter Dörwald79e913e2007-05-12 11:08:06 +00005734static const char *hexdigits = "0123456789abcdef";
5735
Alexander Belopolsky40018472011-02-26 01:02:56 +00005736PyObject *
5737PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005738 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005739{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005740 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005741 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005742
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005743#ifdef Py_UNICODE_WIDE
5744 const Py_ssize_t expandsize = 10;
5745#else
5746 const Py_ssize_t expandsize = 6;
5747#endif
5748
Thomas Wouters89f507f2006-12-13 04:49:30 +00005749 /* XXX(nnorwitz): rather than over-allocating, it would be
5750 better to choose a different scheme. Perhaps scan the
5751 first N-chars of the string and allocate based on that size.
5752 */
5753 /* Initial allocation is based on the longest-possible unichr
5754 escape.
5755
5756 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5757 unichr, so in this case it's the longest unichr escape. In
5758 narrow (UTF-16) builds this is five chars per source unichr
5759 since there are two unichrs in the surrogate pair, so in narrow
5760 (UTF-16) builds it's not the longest unichr escape.
5761
5762 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5763 so in the narrow (UTF-16) build case it's the longest unichr
5764 escape.
5765 */
5766
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005767 if (size == 0)
5768 return PyBytes_FromStringAndSize(NULL, 0);
5769
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005770 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005771 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005772
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005773 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005774 2
5775 + expandsize*size
5776 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005777 if (repr == NULL)
5778 return NULL;
5779
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005780 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005781
Guido van Rossumd57fd912000-03-10 22:53:23 +00005782 while (size-- > 0) {
5783 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005784
Walter Dörwald79e913e2007-05-12 11:08:06 +00005785 /* Escape backslashes */
5786 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005787 *p++ = '\\';
5788 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005789 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005790 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005791
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00005792#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005793 /* Map 21-bit characters to '\U00xxxxxx' */
5794 else if (ch >= 0x10000) {
5795 *p++ = '\\';
5796 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005797 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5798 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5799 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5800 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5801 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5802 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5803 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5804 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005805 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005806 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005807#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005808 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5809 else if (ch >= 0xD800 && ch < 0xDC00) {
5810 Py_UNICODE ch2;
5811 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00005812
Benjamin Peterson29060642009-01-31 22:14:21 +00005813 ch2 = *s++;
5814 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005815 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005816 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5817 *p++ = '\\';
5818 *p++ = 'U';
5819 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
5820 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
5821 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
5822 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
5823 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
5824 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
5825 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
5826 *p++ = hexdigits[ucs & 0x0000000F];
5827 continue;
5828 }
5829 /* Fall through: isolated surrogates are copied as-is */
5830 s--;
5831 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005832 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005833#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005834
Guido van Rossumd57fd912000-03-10 22:53:23 +00005835 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005836 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005837 *p++ = '\\';
5838 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005839 *p++ = hexdigits[(ch >> 12) & 0x000F];
5840 *p++ = hexdigits[(ch >> 8) & 0x000F];
5841 *p++ = hexdigits[(ch >> 4) & 0x000F];
5842 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005843 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005844
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005845 /* Map special whitespace to '\t', \n', '\r' */
5846 else if (ch == '\t') {
5847 *p++ = '\\';
5848 *p++ = 't';
5849 }
5850 else if (ch == '\n') {
5851 *p++ = '\\';
5852 *p++ = 'n';
5853 }
5854 else if (ch == '\r') {
5855 *p++ = '\\';
5856 *p++ = 'r';
5857 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005858
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005859 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005860 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005861 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005862 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005863 *p++ = hexdigits[(ch >> 4) & 0x000F];
5864 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005865 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005866
Guido van Rossumd57fd912000-03-10 22:53:23 +00005867 /* Copy everything else as-is */
5868 else
5869 *p++ = (char) ch;
5870 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005871
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005872 assert(p - PyBytes_AS_STRING(repr) > 0);
5873 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5874 return NULL;
5875 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005876}
5877
Alexander Belopolsky40018472011-02-26 01:02:56 +00005878PyObject *
5879PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005880{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005881 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005882 if (!PyUnicode_Check(unicode)) {
5883 PyErr_BadArgument();
5884 return NULL;
5885 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00005886 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5887 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005888 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005889}
5890
5891/* --- Raw Unicode Escape Codec ------------------------------------------- */
5892
Alexander Belopolsky40018472011-02-26 01:02:56 +00005893PyObject *
5894PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005895 Py_ssize_t size,
5896 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005897{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005898 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005899 Py_ssize_t startinpos;
5900 Py_ssize_t endinpos;
5901 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005902 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005903 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005904 const char *end;
5905 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005906 PyObject *errorHandler = NULL;
5907 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005908
Guido van Rossumd57fd912000-03-10 22:53:23 +00005909 /* Escaped strings will always be longer than the resulting
5910 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005911 length after conversion to the true value. (But decoding error
5912 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005913 v = _PyUnicode_New(size);
5914 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005915 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005916 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005917 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005918 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005919 end = s + size;
5920 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005921 unsigned char c;
5922 Py_UCS4 x;
5923 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005924 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005925
Benjamin Peterson29060642009-01-31 22:14:21 +00005926 /* Non-escape characters are interpreted as Unicode ordinals */
5927 if (*s != '\\') {
5928 *p++ = (unsigned char)*s++;
5929 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005930 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005931 startinpos = s-starts;
5932
5933 /* \u-escapes are only interpreted iff the number of leading
5934 backslashes if odd */
5935 bs = s;
5936 for (;s < end;) {
5937 if (*s != '\\')
5938 break;
5939 *p++ = (unsigned char)*s++;
5940 }
5941 if (((s - bs) & 1) == 0 ||
5942 s >= end ||
5943 (*s != 'u' && *s != 'U')) {
5944 continue;
5945 }
5946 p--;
5947 count = *s=='u' ? 4 : 8;
5948 s++;
5949
5950 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
5951 outpos = p-PyUnicode_AS_UNICODE(v);
5952 for (x = 0, i = 0; i < count; ++i, ++s) {
5953 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005954 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005955 endinpos = s-starts;
5956 if (unicode_decode_call_errorhandler(
5957 errors, &errorHandler,
5958 "rawunicodeescape", "truncated \\uXXXX",
5959 &starts, &end, &startinpos, &endinpos, &exc, &s,
5960 &v, &outpos, &p))
5961 goto onError;
5962 goto nextByte;
5963 }
5964 x = (x<<4) & ~0xF;
5965 if (c >= '0' && c <= '9')
5966 x += c - '0';
5967 else if (c >= 'a' && c <= 'f')
5968 x += 10 + c - 'a';
5969 else
5970 x += 10 + c - 'A';
5971 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00005972 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00005973 /* UCS-2 character */
5974 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005975 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005976 /* UCS-4 character. Either store directly, or as
5977 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00005978#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005979 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005980#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005981 x -= 0x10000L;
5982 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
5983 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00005984#endif
5985 } else {
5986 endinpos = s-starts;
5987 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005988 if (unicode_decode_call_errorhandler(
5989 errors, &errorHandler,
5990 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005991 &starts, &end, &startinpos, &endinpos, &exc, &s,
5992 &v, &outpos, &p))
5993 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005994 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005995 nextByte:
5996 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005997 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02005998 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005999 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006000 Py_XDECREF(errorHandler);
6001 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006002#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006003 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006004 Py_DECREF(v);
6005 return NULL;
6006 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006007#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006008 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006009 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006010
Benjamin Peterson29060642009-01-31 22:14:21 +00006011 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006012 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006013 Py_XDECREF(errorHandler);
6014 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006015 return NULL;
6016}
6017
Alexander Belopolsky40018472011-02-26 01:02:56 +00006018PyObject *
6019PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006020 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006021{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006022 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006023 char *p;
6024 char *q;
6025
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006026#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006027 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006028#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006029 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006030#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00006031
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006032 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006033 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006034
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006035 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006036 if (repr == NULL)
6037 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00006038 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006039 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006040
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006041 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006042 while (size-- > 0) {
6043 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006044#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006045 /* Map 32-bit characters to '\Uxxxxxxxx' */
6046 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006047 *p++ = '\\';
6048 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00006049 *p++ = hexdigits[(ch >> 28) & 0xf];
6050 *p++ = hexdigits[(ch >> 24) & 0xf];
6051 *p++ = hexdigits[(ch >> 20) & 0xf];
6052 *p++ = hexdigits[(ch >> 16) & 0xf];
6053 *p++ = hexdigits[(ch >> 12) & 0xf];
6054 *p++ = hexdigits[(ch >> 8) & 0xf];
6055 *p++ = hexdigits[(ch >> 4) & 0xf];
6056 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006057 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006058 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00006059#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006060 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
6061 if (ch >= 0xD800 && ch < 0xDC00) {
6062 Py_UNICODE ch2;
6063 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006064
Benjamin Peterson29060642009-01-31 22:14:21 +00006065 ch2 = *s++;
6066 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00006067 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006068 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
6069 *p++ = '\\';
6070 *p++ = 'U';
6071 *p++ = hexdigits[(ucs >> 28) & 0xf];
6072 *p++ = hexdigits[(ucs >> 24) & 0xf];
6073 *p++ = hexdigits[(ucs >> 20) & 0xf];
6074 *p++ = hexdigits[(ucs >> 16) & 0xf];
6075 *p++ = hexdigits[(ucs >> 12) & 0xf];
6076 *p++ = hexdigits[(ucs >> 8) & 0xf];
6077 *p++ = hexdigits[(ucs >> 4) & 0xf];
6078 *p++ = hexdigits[ucs & 0xf];
6079 continue;
6080 }
6081 /* Fall through: isolated surrogates are copied as-is */
6082 s--;
6083 size++;
6084 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006085#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006086 /* Map 16-bit characters to '\uxxxx' */
6087 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006088 *p++ = '\\';
6089 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00006090 *p++ = hexdigits[(ch >> 12) & 0xf];
6091 *p++ = hexdigits[(ch >> 8) & 0xf];
6092 *p++ = hexdigits[(ch >> 4) & 0xf];
6093 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006094 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006095 /* Copy everything else as-is */
6096 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006097 *p++ = (char) ch;
6098 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006099 size = p - q;
6100
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006101 assert(size > 0);
6102 if (_PyBytes_Resize(&repr, size) < 0)
6103 return NULL;
6104 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006105}
6106
Alexander Belopolsky40018472011-02-26 01:02:56 +00006107PyObject *
6108PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006109{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006110 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006111 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00006112 PyErr_BadArgument();
6113 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006114 }
Walter Dörwald711005d2007-05-12 12:03:26 +00006115 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
6116 PyUnicode_GET_SIZE(unicode));
6117
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006118 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006119}
6120
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006121/* --- Unicode Internal Codec ------------------------------------------- */
6122
Alexander Belopolsky40018472011-02-26 01:02:56 +00006123PyObject *
6124_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006125 Py_ssize_t size,
6126 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006127{
6128 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006129 Py_ssize_t startinpos;
6130 Py_ssize_t endinpos;
6131 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006132 PyUnicodeObject *v;
6133 Py_UNICODE *p;
6134 const char *end;
6135 const char *reason;
6136 PyObject *errorHandler = NULL;
6137 PyObject *exc = NULL;
6138
Neal Norwitzd43069c2006-01-08 01:12:10 +00006139#ifdef Py_UNICODE_WIDE
6140 Py_UNICODE unimax = PyUnicode_GetMax();
6141#endif
6142
Thomas Wouters89f507f2006-12-13 04:49:30 +00006143 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006144 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
6145 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006146 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006147 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
6148 as string was created with the old API. */
6149 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006150 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006151 p = PyUnicode_AS_UNICODE(v);
6152 end = s + size;
6153
6154 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006155 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006156 /* We have to sanity check the raw data, otherwise doom looms for
6157 some malformed UCS-4 data. */
6158 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006159#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006160 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006161#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006162 end-s < Py_UNICODE_SIZE
6163 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006164 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006165 startinpos = s - starts;
6166 if (end-s < Py_UNICODE_SIZE) {
6167 endinpos = end-starts;
6168 reason = "truncated input";
6169 }
6170 else {
6171 endinpos = s - starts + Py_UNICODE_SIZE;
6172 reason = "illegal code point (> 0x10FFFF)";
6173 }
6174 outpos = p - PyUnicode_AS_UNICODE(v);
6175 if (unicode_decode_call_errorhandler(
6176 errors, &errorHandler,
6177 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006178 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00006179 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006180 goto onError;
6181 }
6182 }
6183 else {
6184 p++;
6185 s += Py_UNICODE_SIZE;
6186 }
6187 }
6188
Victor Stinnerfe226c02011-10-03 03:52:20 +02006189 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006190 goto onError;
6191 Py_XDECREF(errorHandler);
6192 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006193#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006194 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006195 Py_DECREF(v);
6196 return NULL;
6197 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006198#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006199 assert(_PyUnicode_CheckConsistency(v, 1));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006200 return (PyObject *)v;
6201
Benjamin Peterson29060642009-01-31 22:14:21 +00006202 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006203 Py_XDECREF(v);
6204 Py_XDECREF(errorHandler);
6205 Py_XDECREF(exc);
6206 return NULL;
6207}
6208
Guido van Rossumd57fd912000-03-10 22:53:23 +00006209/* --- Latin-1 Codec ------------------------------------------------------ */
6210
Alexander Belopolsky40018472011-02-26 01:02:56 +00006211PyObject *
6212PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006213 Py_ssize_t size,
6214 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006215{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006216 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006217 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006218}
6219
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006220/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006221static void
6222make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006223 const char *encoding,
6224 const Py_UNICODE *unicode, Py_ssize_t size,
6225 Py_ssize_t startpos, Py_ssize_t endpos,
6226 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006227{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006228 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006229 *exceptionObject = PyUnicodeEncodeError_Create(
6230 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006231 }
6232 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006233 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6234 goto onError;
6235 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6236 goto onError;
6237 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6238 goto onError;
6239 return;
6240 onError:
6241 Py_DECREF(*exceptionObject);
6242 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006243 }
6244}
6245
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006246/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006247static void
6248raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006249 const char *encoding,
6250 const Py_UNICODE *unicode, Py_ssize_t size,
6251 Py_ssize_t startpos, Py_ssize_t endpos,
6252 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006253{
6254 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006255 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006256 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006257 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006258}
6259
6260/* error handling callback helper:
6261 build arguments, call the callback and check the arguments,
6262 put the result into newpos and return the replacement string, which
6263 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006264static PyObject *
6265unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006266 PyObject **errorHandler,
6267 const char *encoding, const char *reason,
6268 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
6269 Py_ssize_t startpos, Py_ssize_t endpos,
6270 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006271{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006272 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006273
6274 PyObject *restuple;
6275 PyObject *resunicode;
6276
6277 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006278 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006279 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006280 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006281 }
6282
6283 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006284 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006285 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006286 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006287
6288 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006289 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006290 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006291 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006292 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006293 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006294 Py_DECREF(restuple);
6295 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006296 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006297 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006298 &resunicode, newpos)) {
6299 Py_DECREF(restuple);
6300 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006301 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006302 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6303 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6304 Py_DECREF(restuple);
6305 return NULL;
6306 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006307 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006308 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006309 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006310 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6311 Py_DECREF(restuple);
6312 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006313 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006314 Py_INCREF(resunicode);
6315 Py_DECREF(restuple);
6316 return resunicode;
6317}
6318
Alexander Belopolsky40018472011-02-26 01:02:56 +00006319static PyObject *
6320unicode_encode_ucs1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006321 Py_ssize_t size,
6322 const char *errors,
6323 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006324{
6325 /* output object */
6326 PyObject *res;
6327 /* pointers to the beginning and end+1 of input */
6328 const Py_UNICODE *startp = p;
6329 const Py_UNICODE *endp = p + size;
6330 /* pointer to the beginning of the unencodable characters */
6331 /* const Py_UNICODE *badp = NULL; */
6332 /* pointer into the output */
6333 char *str;
6334 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006335 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006336 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6337 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006338 PyObject *errorHandler = NULL;
6339 PyObject *exc = NULL;
6340 /* the following variable is used for caching string comparisons
6341 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6342 int known_errorHandler = -1;
6343
6344 /* allocate enough for a simple encoding without
6345 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006346 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006347 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006348 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006349 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006350 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006351 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006352 ressize = size;
6353
6354 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006355 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006356
Benjamin Peterson29060642009-01-31 22:14:21 +00006357 /* can we encode this? */
6358 if (c<limit) {
6359 /* no overflow check, because we know that the space is enough */
6360 *str++ = (char)c;
6361 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006362 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006363 else {
6364 Py_ssize_t unicodepos = p-startp;
6365 Py_ssize_t requiredsize;
6366 PyObject *repunicode;
6367 Py_ssize_t repsize;
6368 Py_ssize_t newpos;
6369 Py_ssize_t respos;
6370 Py_UNICODE *uni2;
6371 /* startpos for collecting unencodable chars */
6372 const Py_UNICODE *collstart = p;
6373 const Py_UNICODE *collend = p;
6374 /* find all unecodable characters */
6375 while ((collend < endp) && ((*collend)>=limit))
6376 ++collend;
6377 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6378 if (known_errorHandler==-1) {
6379 if ((errors==NULL) || (!strcmp(errors, "strict")))
6380 known_errorHandler = 1;
6381 else if (!strcmp(errors, "replace"))
6382 known_errorHandler = 2;
6383 else if (!strcmp(errors, "ignore"))
6384 known_errorHandler = 3;
6385 else if (!strcmp(errors, "xmlcharrefreplace"))
6386 known_errorHandler = 4;
6387 else
6388 known_errorHandler = 0;
6389 }
6390 switch (known_errorHandler) {
6391 case 1: /* strict */
6392 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
6393 goto onError;
6394 case 2: /* replace */
6395 while (collstart++<collend)
6396 *str++ = '?'; /* fall through */
6397 case 3: /* ignore */
6398 p = collend;
6399 break;
6400 case 4: /* xmlcharrefreplace */
6401 respos = str - PyBytes_AS_STRING(res);
6402 /* determine replacement size (temporarily (mis)uses p) */
6403 for (p = collstart, repsize = 0; p < collend; ++p) {
6404 if (*p<10)
6405 repsize += 2+1+1;
6406 else if (*p<100)
6407 repsize += 2+2+1;
6408 else if (*p<1000)
6409 repsize += 2+3+1;
6410 else if (*p<10000)
6411 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006412#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006413 else
6414 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006415#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006416 else if (*p<100000)
6417 repsize += 2+5+1;
6418 else if (*p<1000000)
6419 repsize += 2+6+1;
6420 else
6421 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00006422#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006423 }
6424 requiredsize = respos+repsize+(endp-collend);
6425 if (requiredsize > ressize) {
6426 if (requiredsize<2*ressize)
6427 requiredsize = 2*ressize;
6428 if (_PyBytes_Resize(&res, requiredsize))
6429 goto onError;
6430 str = PyBytes_AS_STRING(res) + respos;
6431 ressize = requiredsize;
6432 }
6433 /* generate replacement (temporarily (mis)uses p) */
6434 for (p = collstart; p < collend; ++p) {
6435 str += sprintf(str, "&#%d;", (int)*p);
6436 }
6437 p = collend;
6438 break;
6439 default:
6440 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6441 encoding, reason, startp, size, &exc,
6442 collstart-startp, collend-startp, &newpos);
6443 if (repunicode == NULL)
6444 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006445 if (PyBytes_Check(repunicode)) {
6446 /* Directly copy bytes result to output. */
6447 repsize = PyBytes_Size(repunicode);
6448 if (repsize > 1) {
6449 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006450 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006451 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6452 Py_DECREF(repunicode);
6453 goto onError;
6454 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006455 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006456 ressize += repsize-1;
6457 }
6458 memcpy(str, PyBytes_AsString(repunicode), repsize);
6459 str += repsize;
6460 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006461 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006462 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006463 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006464 /* need more space? (at least enough for what we
6465 have+the replacement+the rest of the string, so
6466 we won't have to check space for encodable characters) */
6467 respos = str - PyBytes_AS_STRING(res);
6468 repsize = PyUnicode_GET_SIZE(repunicode);
6469 requiredsize = respos+repsize+(endp-collend);
6470 if (requiredsize > ressize) {
6471 if (requiredsize<2*ressize)
6472 requiredsize = 2*ressize;
6473 if (_PyBytes_Resize(&res, requiredsize)) {
6474 Py_DECREF(repunicode);
6475 goto onError;
6476 }
6477 str = PyBytes_AS_STRING(res) + respos;
6478 ressize = requiredsize;
6479 }
6480 /* check if there is anything unencodable in the replacement
6481 and copy it to the output */
6482 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
6483 c = *uni2;
6484 if (c >= limit) {
6485 raise_encode_exception(&exc, encoding, startp, size,
6486 unicodepos, unicodepos+1, reason);
6487 Py_DECREF(repunicode);
6488 goto onError;
6489 }
6490 *str = (char)c;
6491 }
6492 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006493 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006494 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006495 }
6496 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006497 /* Resize if we allocated to much */
6498 size = str - PyBytes_AS_STRING(res);
6499 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006500 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006501 if (_PyBytes_Resize(&res, size) < 0)
6502 goto onError;
6503 }
6504
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006505 Py_XDECREF(errorHandler);
6506 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006507 return res;
6508
6509 onError:
6510 Py_XDECREF(res);
6511 Py_XDECREF(errorHandler);
6512 Py_XDECREF(exc);
6513 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006514}
6515
Alexander Belopolsky40018472011-02-26 01:02:56 +00006516PyObject *
6517PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006518 Py_ssize_t size,
6519 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006520{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006521 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006522}
6523
Alexander Belopolsky40018472011-02-26 01:02:56 +00006524PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006525_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006526{
6527 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006528 PyErr_BadArgument();
6529 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006530 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006531 if (PyUnicode_READY(unicode) == -1)
6532 return NULL;
6533 /* Fast path: if it is a one-byte string, construct
6534 bytes object directly. */
6535 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6536 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6537 PyUnicode_GET_LENGTH(unicode));
6538 /* Non-Latin-1 characters present. Defer to above function to
6539 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006540 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006541 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006542 errors);
6543}
6544
6545PyObject*
6546PyUnicode_AsLatin1String(PyObject *unicode)
6547{
6548 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006549}
6550
6551/* --- 7-bit ASCII Codec -------------------------------------------------- */
6552
Alexander Belopolsky40018472011-02-26 01:02:56 +00006553PyObject *
6554PyUnicode_DecodeASCII(const char *s,
6555 Py_ssize_t size,
6556 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006557{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006558 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006559 PyUnicodeObject *v;
Victor Stinner702c7342011-10-05 13:50:52 +02006560 Py_UNICODE *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006561 Py_ssize_t startinpos;
6562 Py_ssize_t endinpos;
6563 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006564 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006565 int has_error;
6566 const unsigned char *p = (const unsigned char *)s;
6567 const unsigned char *end = p + size;
6568 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006569 PyObject *errorHandler = NULL;
6570 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006571
Guido van Rossumd57fd912000-03-10 22:53:23 +00006572 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006573 if (size == 1 && (unsigned char)s[0] < 128)
6574 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006575
Victor Stinner702c7342011-10-05 13:50:52 +02006576 has_error = 0;
6577 while (p < end && !has_error) {
6578 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6579 an explanation. */
6580 if (!((size_t) p & LONG_PTR_MASK)) {
6581 /* Help register allocation */
6582 register const unsigned char *_p = p;
6583 while (_p < aligned_end) {
6584 unsigned long value = *(unsigned long *) _p;
6585 if (value & ASCII_CHAR_MASK) {
6586 has_error = 1;
6587 break;
6588 }
6589 _p += SIZEOF_LONG;
6590 }
6591 if (_p == end)
6592 break;
6593 if (has_error)
6594 break;
6595 p = _p;
6596 }
6597 if (*p & 0x80) {
6598 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006599 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006600 }
6601 else {
6602 ++p;
6603 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006604 }
Victor Stinner702c7342011-10-05 13:50:52 +02006605 if (!has_error)
6606 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00006607
Guido van Rossumd57fd912000-03-10 22:53:23 +00006608 v = _PyUnicode_New(size);
6609 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006610 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006611 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006612 return (PyObject *)v;
Victor Stinner702c7342011-10-05 13:50:52 +02006613 u = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006614 e = s + size;
6615 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006616 register unsigned char c = (unsigned char)*s;
6617 if (c < 128) {
Victor Stinner702c7342011-10-05 13:50:52 +02006618 *u++ = c;
Benjamin Peterson29060642009-01-31 22:14:21 +00006619 ++s;
6620 }
6621 else {
6622 startinpos = s-starts;
6623 endinpos = startinpos + 1;
Victor Stinner702c7342011-10-05 13:50:52 +02006624 outpos = u - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006625 if (unicode_decode_call_errorhandler(
6626 errors, &errorHandler,
6627 "ascii", "ordinal not in range(128)",
6628 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinner702c7342011-10-05 13:50:52 +02006629 &v, &outpos, &u))
Benjamin Peterson29060642009-01-31 22:14:21 +00006630 goto onError;
6631 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006632 }
Victor Stinner702c7342011-10-05 13:50:52 +02006633 if (u - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
6634 if (PyUnicode_Resize((PyObject**)&v, u - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006635 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006636 Py_XDECREF(errorHandler);
6637 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006638#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006639 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006640 Py_DECREF(v);
6641 return NULL;
6642 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006643#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006644 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006645 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006646
Benjamin Peterson29060642009-01-31 22:14:21 +00006647 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006648 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006649 Py_XDECREF(errorHandler);
6650 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006651 return NULL;
6652}
6653
Alexander Belopolsky40018472011-02-26 01:02:56 +00006654PyObject *
6655PyUnicode_EncodeASCII(const Py_UNICODE *p,
6656 Py_ssize_t size,
6657 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006658{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006659 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006660}
6661
Alexander Belopolsky40018472011-02-26 01:02:56 +00006662PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006663_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006664{
6665 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006666 PyErr_BadArgument();
6667 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006668 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006669 if (PyUnicode_READY(unicode) == -1)
6670 return NULL;
6671 /* Fast path: if it is an ASCII-only string, construct bytes object
6672 directly. Else defer to above function to raise the exception. */
6673 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6674 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6675 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006676 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006677 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006678 errors);
6679}
6680
6681PyObject *
6682PyUnicode_AsASCIIString(PyObject *unicode)
6683{
6684 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006685}
6686
Victor Stinner99b95382011-07-04 14:23:54 +02006687#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006688
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006689/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006690
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006691#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006692#define NEED_RETRY
6693#endif
6694
6695/* XXX This code is limited to "true" double-byte encodings, as
6696 a) it assumes an incomplete character consists of a single byte, and
6697 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00006698 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006699
/* Decide whether the byte at s[offset] is a DBCS lead byte.

   Returns 0 when IsDBCSLeadByte() rejects the byte.  Otherwise the
   position reported by CharPrev() is consulted and 1 is returned when
   there is no earlier character, when that earlier byte is not itself a
   lead byte, or when it sits exactly two bytes back; else 0. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *pos = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*pos)) {
        return 0;
    }

    before = CharPrev(s, pos);
    if (before == pos) {
        return 1;
    }
    if (!IsDBCSLeadByte(*before)) {
        return 1;
    }
    return pos - before == 2;
}
6711
6712/*
6713 * Decode MBCS string into unicode object. If 'final' is set, converts
6714 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6715 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006716static int
6717decode_mbcs(PyUnicodeObject **v,
6718 const char *s, /* MBCS string */
6719 int size, /* sizeof MBCS string */
6720 int final,
6721 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006722{
6723 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00006724 Py_ssize_t n;
6725 DWORD usize;
6726 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006727
6728 assert(size >= 0);
6729
Victor Stinner554f3f02010-06-16 23:33:54 +00006730 /* check and handle 'errors' arg */
6731 if (errors==NULL || strcmp(errors, "strict")==0)
6732 flags = MB_ERR_INVALID_CHARS;
6733 else if (strcmp(errors, "ignore")==0)
6734 flags = 0;
6735 else {
6736 PyErr_Format(PyExc_ValueError,
6737 "mbcs encoding does not support errors='%s'",
6738 errors);
6739 return -1;
6740 }
6741
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006742 /* Skip trailing lead-byte unless 'final' is set */
6743 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006744 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006745
6746 /* First get the size of the result */
6747 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006748 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6749 if (usize==0)
6750 goto mbcs_decode_error;
6751 } else
6752 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006753
6754 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006755 /* Create unicode object */
6756 *v = _PyUnicode_New(usize);
6757 if (*v == NULL)
6758 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006759 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006760 }
6761 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006762 /* Extend unicode object */
6763 n = PyUnicode_GET_SIZE(*v);
Victor Stinner2fd82272011-10-03 04:06:05 +02006764 if (PyUnicode_Resize((PyObject**)v, n + usize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006765 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006766 }
6767
6768 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00006769 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006770 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006771 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6772 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00006773 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006774 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006775 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00006776
6777mbcs_decode_error:
6778 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6779 we raise a UnicodeDecodeError - else it is a 'generic'
6780 windows error
6781 */
6782 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6783 /* Ideally, we should get reason from FormatMessage - this
6784 is the Windows 2000 English version of the message
6785 */
6786 PyObject *exc = NULL;
6787 const char *reason = "No mapping for the Unicode character exists "
6788 "in the target multi-byte code page.";
6789 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6790 if (exc != NULL) {
6791 PyCodec_StrictErrors(exc);
6792 Py_DECREF(exc);
6793 }
6794 } else {
6795 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6796 }
6797 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006798}
6799
Alexander Belopolsky40018472011-02-26 01:02:56 +00006800PyObject *
6801PyUnicode_DecodeMBCSStateful(const char *s,
6802 Py_ssize_t size,
6803 const char *errors,
6804 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006805{
6806 PyUnicodeObject *v = NULL;
6807 int done;
6808
6809 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006810 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006811
6812#ifdef NEED_RETRY
6813 retry:
6814 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006815 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006816 else
6817#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006818 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006819
6820 if (done < 0) {
6821 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006822 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006823 }
6824
6825 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006826 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006827
6828#ifdef NEED_RETRY
6829 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006830 s += done;
6831 size -= done;
6832 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006833 }
6834#endif
Victor Stinner17efeed2011-10-04 20:05:46 +02006835#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006836 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006837 Py_DECREF(v);
6838 return NULL;
6839 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006840#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006841 assert(_PyUnicode_CheckConsistency(v, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006842 return (PyObject *)v;
6843}
6844
Alexander Belopolsky40018472011-02-26 01:02:56 +00006845PyObject *
6846PyUnicode_DecodeMBCS(const char *s,
6847 Py_ssize_t size,
6848 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006849{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006850 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6851}
6852
6853/*
6854 * Convert unicode into string object (MBCS).
6855 * Returns 0 if succeed, -1 otherwise.
6856 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006857static int
6858encode_mbcs(PyObject **repr,
6859 const Py_UNICODE *p, /* unicode */
6860 int size, /* size of unicode */
6861 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006862{
Victor Stinner554f3f02010-06-16 23:33:54 +00006863 BOOL usedDefaultChar = FALSE;
6864 BOOL *pusedDefaultChar;
6865 int mbcssize;
6866 Py_ssize_t n;
6867 PyObject *exc = NULL;
6868 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006869
6870 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006871
Victor Stinner554f3f02010-06-16 23:33:54 +00006872 /* check and handle 'errors' arg */
6873 if (errors==NULL || strcmp(errors, "strict")==0) {
6874 flags = WC_NO_BEST_FIT_CHARS;
6875 pusedDefaultChar = &usedDefaultChar;
6876 } else if (strcmp(errors, "replace")==0) {
6877 flags = 0;
6878 pusedDefaultChar = NULL;
6879 } else {
6880 PyErr_Format(PyExc_ValueError,
6881 "mbcs encoding does not support errors='%s'",
6882 errors);
6883 return -1;
6884 }
6885
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006886 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006887 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006888 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
6889 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00006890 if (mbcssize == 0) {
6891 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6892 return -1;
6893 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006894 /* If we used a default char, then we failed! */
6895 if (pusedDefaultChar && *pusedDefaultChar)
6896 goto mbcs_encode_error;
6897 } else {
6898 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006899 }
6900
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006901 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006902 /* Create string object */
6903 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
6904 if (*repr == NULL)
6905 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006906 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006907 }
6908 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006909 /* Extend string object */
6910 n = PyBytes_Size(*repr);
6911 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
6912 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006913 }
6914
6915 /* Do the conversion */
6916 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006917 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006918 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
6919 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006920 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6921 return -1;
6922 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006923 if (pusedDefaultChar && *pusedDefaultChar)
6924 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006925 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006926 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006927
6928mbcs_encode_error:
6929 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
6930 Py_XDECREF(exc);
6931 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006932}
6933
Alexander Belopolsky40018472011-02-26 01:02:56 +00006934PyObject *
6935PyUnicode_EncodeMBCS(const Py_UNICODE *p,
6936 Py_ssize_t size,
6937 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006938{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006939 PyObject *repr = NULL;
6940 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00006941
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006942#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00006943 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006944 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006945 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006946 else
6947#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006948 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006949
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006950 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006951 Py_XDECREF(repr);
6952 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006953 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006954
6955#ifdef NEED_RETRY
6956 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006957 p += INT_MAX;
6958 size -= INT_MAX;
6959 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006960 }
6961#endif
6962
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006963 return repr;
6964}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006965
Alexander Belopolsky40018472011-02-26 01:02:56 +00006966PyObject *
6967PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006968{
6969 if (!PyUnicode_Check(unicode)) {
6970 PyErr_BadArgument();
6971 return NULL;
6972 }
6973 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006974 PyUnicode_GET_SIZE(unicode),
6975 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006976}
6977
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006978#undef NEED_RETRY
6979
Victor Stinner99b95382011-07-04 14:23:54 +02006980#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006981
Guido van Rossumd57fd912000-03-10 22:53:23 +00006982/* --- Character Mapping Codec -------------------------------------------- */
6983
Alexander Belopolsky40018472011-02-26 01:02:56 +00006984PyObject *
6985PyUnicode_DecodeCharmap(const char *s,
6986 Py_ssize_t size,
6987 PyObject *mapping,
6988 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006989{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006990 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006991 Py_ssize_t startinpos;
6992 Py_ssize_t endinpos;
6993 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006994 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006995 PyUnicodeObject *v;
6996 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006997 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006998 PyObject *errorHandler = NULL;
6999 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007000 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007001 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00007002
Guido van Rossumd57fd912000-03-10 22:53:23 +00007003 /* Default to Latin-1 */
7004 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007005 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007006
7007 v = _PyUnicode_New(size);
7008 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007009 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007010 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007011 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007012 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007013 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007014 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007015 mapstring = PyUnicode_AS_UNICODE(mapping);
7016 maplen = PyUnicode_GET_SIZE(mapping);
7017 while (s < e) {
7018 unsigned char ch = *s;
7019 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007020
Benjamin Peterson29060642009-01-31 22:14:21 +00007021 if (ch < maplen)
7022 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00007023
Benjamin Peterson29060642009-01-31 22:14:21 +00007024 if (x == 0xfffe) {
7025 /* undefined mapping */
7026 outpos = p-PyUnicode_AS_UNICODE(v);
7027 startinpos = s-starts;
7028 endinpos = startinpos+1;
7029 if (unicode_decode_call_errorhandler(
7030 errors, &errorHandler,
7031 "charmap", "character maps to <undefined>",
7032 &starts, &e, &startinpos, &endinpos, &exc, &s,
7033 &v, &outpos, &p)) {
7034 goto onError;
7035 }
7036 continue;
7037 }
7038 *p++ = x;
7039 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007040 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007041 }
7042 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007043 while (s < e) {
7044 unsigned char ch = *s;
7045 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007046
Benjamin Peterson29060642009-01-31 22:14:21 +00007047 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7048 w = PyLong_FromLong((long)ch);
7049 if (w == NULL)
7050 goto onError;
7051 x = PyObject_GetItem(mapping, w);
7052 Py_DECREF(w);
7053 if (x == NULL) {
7054 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7055 /* No mapping found means: mapping is undefined. */
7056 PyErr_Clear();
7057 x = Py_None;
7058 Py_INCREF(x);
7059 } else
7060 goto onError;
7061 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007062
Benjamin Peterson29060642009-01-31 22:14:21 +00007063 /* Apply mapping */
7064 if (PyLong_Check(x)) {
7065 long value = PyLong_AS_LONG(x);
7066 if (value < 0 || value > 65535) {
7067 PyErr_SetString(PyExc_TypeError,
7068 "character mapping must be in range(65536)");
7069 Py_DECREF(x);
7070 goto onError;
7071 }
7072 *p++ = (Py_UNICODE)value;
7073 }
7074 else if (x == Py_None) {
7075 /* undefined mapping */
7076 outpos = p-PyUnicode_AS_UNICODE(v);
7077 startinpos = s-starts;
7078 endinpos = startinpos+1;
7079 if (unicode_decode_call_errorhandler(
7080 errors, &errorHandler,
7081 "charmap", "character maps to <undefined>",
7082 &starts, &e, &startinpos, &endinpos, &exc, &s,
7083 &v, &outpos, &p)) {
7084 Py_DECREF(x);
7085 goto onError;
7086 }
7087 Py_DECREF(x);
7088 continue;
7089 }
7090 else if (PyUnicode_Check(x)) {
7091 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007092
Benjamin Peterson29060642009-01-31 22:14:21 +00007093 if (targetsize == 1)
7094 /* 1-1 mapping */
7095 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007096
Benjamin Peterson29060642009-01-31 22:14:21 +00007097 else if (targetsize > 1) {
7098 /* 1-n mapping */
7099 if (targetsize > extrachars) {
7100 /* resize first */
7101 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
7102 Py_ssize_t needed = (targetsize - extrachars) + \
7103 (targetsize << 2);
7104 extrachars += needed;
7105 /* XXX overflow detection missing */
Victor Stinnerfe226c02011-10-03 03:52:20 +02007106 if (PyUnicode_Resize((PyObject**)&v,
Benjamin Peterson29060642009-01-31 22:14:21 +00007107 PyUnicode_GET_SIZE(v) + needed) < 0) {
7108 Py_DECREF(x);
7109 goto onError;
7110 }
7111 p = PyUnicode_AS_UNICODE(v) + oldpos;
7112 }
7113 Py_UNICODE_COPY(p,
7114 PyUnicode_AS_UNICODE(x),
7115 targetsize);
7116 p += targetsize;
7117 extrachars -= targetsize;
7118 }
7119 /* 1-0 mapping: skip the character */
7120 }
7121 else {
7122 /* wrong return value */
7123 PyErr_SetString(PyExc_TypeError,
7124 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007125 Py_DECREF(x);
7126 goto onError;
7127 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007128 Py_DECREF(x);
7129 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007130 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007131 }
7132 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinnerfe226c02011-10-03 03:52:20 +02007133 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007134 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007135 Py_XDECREF(errorHandler);
7136 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02007137#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02007138 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007139 Py_DECREF(v);
7140 return NULL;
7141 }
Victor Stinner17efeed2011-10-04 20:05:46 +02007142#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02007143 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007144 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00007145
Benjamin Peterson29060642009-01-31 22:14:21 +00007146 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007147 Py_XDECREF(errorHandler);
7148 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007149 Py_XDECREF(v);
7150 return NULL;
7151}
7152
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007153/* Charmap encoding: the lookup table */
7154
Alexander Belopolsky40018472011-02-26 01:02:56 +00007155struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007156 PyObject_HEAD
7157 unsigned char level1[32];
7158 int count2, count3;
7159 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007160};
7161
7162static PyObject*
7163encoding_map_size(PyObject *obj, PyObject* args)
7164{
7165 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007166 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007167 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007168}
7169
7170static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007171 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007172 PyDoc_STR("Return the size (in bytes) of this object") },
7173 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007174};
7175
7176static void
7177encoding_map_dealloc(PyObject* o)
7178{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007179 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007180}
7181
7182static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007183 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007184 "EncodingMap", /*tp_name*/
7185 sizeof(struct encoding_map), /*tp_basicsize*/
7186 0, /*tp_itemsize*/
7187 /* methods */
7188 encoding_map_dealloc, /*tp_dealloc*/
7189 0, /*tp_print*/
7190 0, /*tp_getattr*/
7191 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007192 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007193 0, /*tp_repr*/
7194 0, /*tp_as_number*/
7195 0, /*tp_as_sequence*/
7196 0, /*tp_as_mapping*/
7197 0, /*tp_hash*/
7198 0, /*tp_call*/
7199 0, /*tp_str*/
7200 0, /*tp_getattro*/
7201 0, /*tp_setattro*/
7202 0, /*tp_as_buffer*/
7203 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7204 0, /*tp_doc*/
7205 0, /*tp_traverse*/
7206 0, /*tp_clear*/
7207 0, /*tp_richcompare*/
7208 0, /*tp_weaklistoffset*/
7209 0, /*tp_iter*/
7210 0, /*tp_iternext*/
7211 encoding_map_methods, /*tp_methods*/
7212 0, /*tp_members*/
7213 0, /*tp_getset*/
7214 0, /*tp_base*/
7215 0, /*tp_dict*/
7216 0, /*tp_descr_get*/
7217 0, /*tp_descr_set*/
7218 0, /*tp_dictoffset*/
7219 0, /*tp_init*/
7220 0, /*tp_alloc*/
7221 0, /*tp_new*/
7222 0, /*tp_free*/
7223 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007224};
7225
7226PyObject*
7227PyUnicode_BuildEncodingMap(PyObject* string)
7228{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007229 PyObject *result;
7230 struct encoding_map *mresult;
7231 int i;
7232 int need_dict = 0;
7233 unsigned char level1[32];
7234 unsigned char level2[512];
7235 unsigned char *mlevel1, *mlevel2, *mlevel3;
7236 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007237 int kind;
7238 void *data;
7239 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007240
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007241 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007242 PyErr_BadArgument();
7243 return NULL;
7244 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007245 kind = PyUnicode_KIND(string);
7246 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007247 memset(level1, 0xFF, sizeof level1);
7248 memset(level2, 0xFF, sizeof level2);
7249
7250 /* If there isn't a one-to-one mapping of NULL to \0,
7251 or if there are non-BMP characters, we need to use
7252 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007253 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007254 need_dict = 1;
7255 for (i = 1; i < 256; i++) {
7256 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007257 ch = PyUnicode_READ(kind, data, i);
7258 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007259 need_dict = 1;
7260 break;
7261 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007262 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007263 /* unmapped character */
7264 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007265 l1 = ch >> 11;
7266 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007267 if (level1[l1] == 0xFF)
7268 level1[l1] = count2++;
7269 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007270 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007271 }
7272
7273 if (count2 >= 0xFF || count3 >= 0xFF)
7274 need_dict = 1;
7275
7276 if (need_dict) {
7277 PyObject *result = PyDict_New();
7278 PyObject *key, *value;
7279 if (!result)
7280 return NULL;
7281 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007282 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007283 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007284 if (!key || !value)
7285 goto failed1;
7286 if (PyDict_SetItem(result, key, value) == -1)
7287 goto failed1;
7288 Py_DECREF(key);
7289 Py_DECREF(value);
7290 }
7291 return result;
7292 failed1:
7293 Py_XDECREF(key);
7294 Py_XDECREF(value);
7295 Py_DECREF(result);
7296 return NULL;
7297 }
7298
7299 /* Create a three-level trie */
7300 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7301 16*count2 + 128*count3 - 1);
7302 if (!result)
7303 return PyErr_NoMemory();
7304 PyObject_Init(result, &EncodingMapType);
7305 mresult = (struct encoding_map*)result;
7306 mresult->count2 = count2;
7307 mresult->count3 = count3;
7308 mlevel1 = mresult->level1;
7309 mlevel2 = mresult->level23;
7310 mlevel3 = mresult->level23 + 16*count2;
7311 memcpy(mlevel1, level1, 32);
7312 memset(mlevel2, 0xFF, 16*count2);
7313 memset(mlevel3, 0, 128*count3);
7314 count3 = 0;
7315 for (i = 1; i < 256; i++) {
7316 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007317 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007318 /* unmapped character */
7319 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007320 o1 = PyUnicode_READ(kind, data, i)>>11;
7321 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007322 i2 = 16*mlevel1[o1] + o2;
7323 if (mlevel2[i2] == 0xFF)
7324 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007325 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007326 i3 = 128*mlevel2[i2] + o3;
7327 mlevel3[i3] = i;
7328 }
7329 return result;
7330}
7331
7332static int
7333encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
7334{
7335 struct encoding_map *map = (struct encoding_map*)mapping;
7336 int l1 = c>>11;
7337 int l2 = (c>>7) & 0xF;
7338 int l3 = c & 0x7F;
7339 int i;
7340
7341#ifdef Py_UNICODE_WIDE
7342 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007343 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007344 }
7345#endif
7346 if (c == 0)
7347 return 0;
7348 /* level 1*/
7349 i = map->level1[l1];
7350 if (i == 0xFF) {
7351 return -1;
7352 }
7353 /* level 2*/
7354 i = map->level23[16*i+l2];
7355 if (i == 0xFF) {
7356 return -1;
7357 }
7358 /* level 3 */
7359 i = map->level23[16*map->count2 + 128*i + l3];
7360 if (i == 0) {
7361 return -1;
7362 }
7363 return i;
7364}
7365
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007366/* Lookup the character ch in the mapping. If the character
7367 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007368 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007369static PyObject *
7370charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007371{
Christian Heimes217cfd12007-12-02 14:31:20 +00007372 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007373 PyObject *x;
7374
7375 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007376 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007377 x = PyObject_GetItem(mapping, w);
7378 Py_DECREF(w);
7379 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007380 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7381 /* No mapping found means: mapping is undefined. */
7382 PyErr_Clear();
7383 x = Py_None;
7384 Py_INCREF(x);
7385 return x;
7386 } else
7387 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007388 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007389 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007390 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007391 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007392 long value = PyLong_AS_LONG(x);
7393 if (value < 0 || value > 255) {
7394 PyErr_SetString(PyExc_TypeError,
7395 "character mapping must be in range(256)");
7396 Py_DECREF(x);
7397 return NULL;
7398 }
7399 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007400 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007401 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007402 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007403 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007404 /* wrong return value */
7405 PyErr_Format(PyExc_TypeError,
7406 "character mapping must return integer, bytes or None, not %.400s",
7407 x->ob_type->tp_name);
7408 Py_DECREF(x);
7409 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007410 }
7411}
7412
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007413static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007414charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007415{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007416 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7417 /* exponentially overallocate to minimize reallocations */
7418 if (requiredsize < 2*outsize)
7419 requiredsize = 2*outsize;
7420 if (_PyBytes_Resize(outobj, requiredsize))
7421 return -1;
7422 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007423}
7424
/* Outcome of writing one character through a charmap encoder. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* a lookup or resize error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007428/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007429 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007430 space is available. Return a new reference to the object that
7431 was put in the output buffer, or Py_None, if the mapping was undefined
7432 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007433 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007434static charmapencode_result
7435charmapencode_output(Py_UNICODE c, PyObject *mapping,
7436 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007437{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007438 PyObject *rep;
7439 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007440 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007441
Christian Heimes90aa7642007-12-19 02:45:37 +00007442 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007443 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007444 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007445 if (res == -1)
7446 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007447 if (outsize<requiredsize)
7448 if (charmapencode_resize(outobj, outpos, requiredsize))
7449 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007450 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007451 outstart[(*outpos)++] = (char)res;
7452 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007453 }
7454
7455 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007456 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007457 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007458 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007459 Py_DECREF(rep);
7460 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007461 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007462 if (PyLong_Check(rep)) {
7463 Py_ssize_t requiredsize = *outpos+1;
7464 if (outsize<requiredsize)
7465 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7466 Py_DECREF(rep);
7467 return enc_EXCEPTION;
7468 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007469 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007470 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007471 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007472 else {
7473 const char *repchars = PyBytes_AS_STRING(rep);
7474 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7475 Py_ssize_t requiredsize = *outpos+repsize;
7476 if (outsize<requiredsize)
7477 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7478 Py_DECREF(rep);
7479 return enc_EXCEPTION;
7480 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007481 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007482 memcpy(outstart + *outpos, repchars, repsize);
7483 *outpos += repsize;
7484 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007485 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007486 Py_DECREF(rep);
7487 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007488}
7489
7490/* handle an error in PyUnicode_EncodeCharmap
7491 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007492static int
7493charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00007494 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007495 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007496 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007497 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007498{
7499 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007500 Py_ssize_t repsize;
7501 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007502 Py_UNICODE *uni2;
7503 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007504 Py_ssize_t collstartpos = *inpos;
7505 Py_ssize_t collendpos = *inpos+1;
7506 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007507 char *encoding = "charmap";
7508 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007509 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007510
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007511 /* find all unencodable characters */
7512 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007513 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007514 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007515 int res = encoding_map_lookup(p[collendpos], mapping);
7516 if (res != -1)
7517 break;
7518 ++collendpos;
7519 continue;
7520 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007521
Benjamin Peterson29060642009-01-31 22:14:21 +00007522 rep = charmapencode_lookup(p[collendpos], mapping);
7523 if (rep==NULL)
7524 return -1;
7525 else if (rep!=Py_None) {
7526 Py_DECREF(rep);
7527 break;
7528 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007529 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007530 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007531 }
7532 /* cache callback name lookup
7533 * (if not done yet, i.e. it's the first error) */
7534 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007535 if ((errors==NULL) || (!strcmp(errors, "strict")))
7536 *known_errorHandler = 1;
7537 else if (!strcmp(errors, "replace"))
7538 *known_errorHandler = 2;
7539 else if (!strcmp(errors, "ignore"))
7540 *known_errorHandler = 3;
7541 else if (!strcmp(errors, "xmlcharrefreplace"))
7542 *known_errorHandler = 4;
7543 else
7544 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007545 }
7546 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007547 case 1: /* strict */
7548 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7549 return -1;
7550 case 2: /* replace */
7551 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007552 x = charmapencode_output('?', mapping, res, respos);
7553 if (x==enc_EXCEPTION) {
7554 return -1;
7555 }
7556 else if (x==enc_FAILED) {
7557 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7558 return -1;
7559 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007560 }
7561 /* fall through */
7562 case 3: /* ignore */
7563 *inpos = collendpos;
7564 break;
7565 case 4: /* xmlcharrefreplace */
7566 /* generate replacement (temporarily (mis)uses p) */
7567 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007568 char buffer[2+29+1+1];
7569 char *cp;
7570 sprintf(buffer, "&#%d;", (int)p[collpos]);
7571 for (cp = buffer; *cp; ++cp) {
7572 x = charmapencode_output(*cp, mapping, res, respos);
7573 if (x==enc_EXCEPTION)
7574 return -1;
7575 else if (x==enc_FAILED) {
7576 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7577 return -1;
7578 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007579 }
7580 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007581 *inpos = collendpos;
7582 break;
7583 default:
7584 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00007585 encoding, reason, p, size, exceptionObject,
7586 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007587 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007588 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007589 if (PyBytes_Check(repunicode)) {
7590 /* Directly copy bytes result to output. */
7591 Py_ssize_t outsize = PyBytes_Size(*res);
7592 Py_ssize_t requiredsize;
7593 repsize = PyBytes_Size(repunicode);
7594 requiredsize = *respos + repsize;
7595 if (requiredsize > outsize)
7596 /* Make room for all additional bytes. */
7597 if (charmapencode_resize(res, respos, requiredsize)) {
7598 Py_DECREF(repunicode);
7599 return -1;
7600 }
7601 memcpy(PyBytes_AsString(*res) + *respos,
7602 PyBytes_AsString(repunicode), repsize);
7603 *respos += repsize;
7604 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007605 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007606 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007607 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007608 /* generate replacement */
7609 repsize = PyUnicode_GET_SIZE(repunicode);
7610 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007611 x = charmapencode_output(*uni2, mapping, res, respos);
7612 if (x==enc_EXCEPTION) {
7613 return -1;
7614 }
7615 else if (x==enc_FAILED) {
7616 Py_DECREF(repunicode);
7617 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7618 return -1;
7619 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007620 }
7621 *inpos = newpos;
7622 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007623 }
7624 return 0;
7625}
7626
Alexander Belopolsky40018472011-02-26 01:02:56 +00007627PyObject *
7628PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7629 Py_ssize_t size,
7630 PyObject *mapping,
7631 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007632{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007633 /* output object */
7634 PyObject *res = NULL;
7635 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007636 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007637 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007638 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007639 PyObject *errorHandler = NULL;
7640 PyObject *exc = NULL;
7641 /* the following variable is used for caching string comparisons
7642 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7643 * 3=ignore, 4=xmlcharrefreplace */
7644 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007645
7646 /* Default to Latin-1 */
7647 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007648 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007649
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007650 /* allocate enough for a simple encoding without
7651 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007652 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007653 if (res == NULL)
7654 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007655 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007656 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007657
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007658 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007659 /* try to encode it */
7660 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7661 if (x==enc_EXCEPTION) /* error */
7662 goto onError;
7663 if (x==enc_FAILED) { /* unencodable character */
7664 if (charmap_encoding_error(p, size, &inpos, mapping,
7665 &exc,
7666 &known_errorHandler, &errorHandler, errors,
7667 &res, &respos)) {
7668 goto onError;
7669 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007670 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007671 else
7672 /* done with this character => adjust input position */
7673 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007674 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007675
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007676 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007677 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007678 if (_PyBytes_Resize(&res, respos) < 0)
7679 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00007680
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007681 Py_XDECREF(exc);
7682 Py_XDECREF(errorHandler);
7683 return res;
7684
Benjamin Peterson29060642009-01-31 22:14:21 +00007685 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007686 Py_XDECREF(res);
7687 Py_XDECREF(exc);
7688 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007689 return NULL;
7690}
7691
Alexander Belopolsky40018472011-02-26 01:02:56 +00007692PyObject *
7693PyUnicode_AsCharmapString(PyObject *unicode,
7694 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007695{
7696 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007697 PyErr_BadArgument();
7698 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007699 }
7700 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007701 PyUnicode_GET_SIZE(unicode),
7702 mapping,
7703 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007704}
7705
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007706/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007707static void
7708make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007709 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007710 Py_ssize_t startpos, Py_ssize_t endpos,
7711 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007712{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007713 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007714 *exceptionObject = _PyUnicodeTranslateError_Create(
7715 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007716 }
7717 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007718 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7719 goto onError;
7720 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7721 goto onError;
7722 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7723 goto onError;
7724 return;
7725 onError:
7726 Py_DECREF(*exceptionObject);
7727 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007728 }
7729}
7730
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007731/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007732static void
7733raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007734 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007735 Py_ssize_t startpos, Py_ssize_t endpos,
7736 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007737{
7738 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007739 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007740 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007741 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007742}
7743
7744/* error handling callback helper:
7745 build arguments, call the callback and check the arguments,
7746 put the result into newpos and return the replacement string, which
7747 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007748static PyObject *
7749unicode_translate_call_errorhandler(const char *errors,
7750 PyObject **errorHandler,
7751 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007752 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007753 Py_ssize_t startpos, Py_ssize_t endpos,
7754 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007755{
Benjamin Peterson142957c2008-07-04 19:55:29 +00007756 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007757
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007758 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007759 PyObject *restuple;
7760 PyObject *resunicode;
7761
7762 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007763 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007764 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007765 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007766 }
7767
7768 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007769 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007770 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007771 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007772
7773 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00007774 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007775 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007776 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007777 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00007778 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007779 Py_DECREF(restuple);
7780 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007781 }
7782 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00007783 &resunicode, &i_newpos)) {
7784 Py_DECREF(restuple);
7785 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007786 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007787 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007788 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007789 else
7790 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007791 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007792 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7793 Py_DECREF(restuple);
7794 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007795 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007796 Py_INCREF(resunicode);
7797 Py_DECREF(restuple);
7798 return resunicode;
7799}
7800
7801/* Lookup the character ch in the mapping and put the result in result,
7802 which must be decrefed by the caller.
7803 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007804static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007805charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007806{
Christian Heimes217cfd12007-12-02 14:31:20 +00007807 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007808 PyObject *x;
7809
7810 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007811 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007812 x = PyObject_GetItem(mapping, w);
7813 Py_DECREF(w);
7814 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007815 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7816 /* No mapping found means: use 1:1 mapping. */
7817 PyErr_Clear();
7818 *result = NULL;
7819 return 0;
7820 } else
7821 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007822 }
7823 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007824 *result = x;
7825 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007826 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007827 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007828 long value = PyLong_AS_LONG(x);
7829 long max = PyUnicode_GetMax();
7830 if (value < 0 || value > max) {
7831 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00007832 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007833 Py_DECREF(x);
7834 return -1;
7835 }
7836 *result = x;
7837 return 0;
7838 }
7839 else if (PyUnicode_Check(x)) {
7840 *result = x;
7841 return 0;
7842 }
7843 else {
7844 /* wrong return value */
7845 PyErr_SetString(PyExc_TypeError,
7846 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007847 Py_DECREF(x);
7848 return -1;
7849 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007850}
7851/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00007852 if not reallocate and adjust various state variables.
7853 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007854static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007855charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00007856 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007857{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007858 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00007859 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007860 /* exponentially overallocate to minimize reallocations */
7861 if (requiredsize < 2 * oldsize)
7862 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007863 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
7864 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007865 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007866 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007867 }
7868 return 0;
7869}
7870/* lookup the character, put the result in the output string and adjust
7871 various state variables. Return a new reference to the object that
7872 was put in the output buffer in *result, or Py_None, if the mapping was
7873 undefined (in which case no character was written).
7874 The called must decref result.
7875 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007876static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007877charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
7878 PyObject *mapping, Py_UCS4 **output,
7879 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007880 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007881{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007882 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
7883 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00007884 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007885 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007886 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007887 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007888 }
7889 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007890 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00007891 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007892 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007893 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007894 }
7895 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007896 Py_ssize_t repsize;
7897 if (PyUnicode_READY(*res) == -1)
7898 return -1;
7899 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00007900 if (repsize==1) {
7901 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007902 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00007903 }
7904 else if (repsize!=0) {
7905 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007906 Py_ssize_t requiredsize = *opos +
7907 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00007908 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007909 Py_ssize_t i;
7910 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007911 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007912 for(i = 0; i < repsize; i++)
7913 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00007914 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007915 }
7916 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007917 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007918 return 0;
7919}
7920
Alexander Belopolsky40018472011-02-26 01:02:56 +00007921PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007922_PyUnicode_TranslateCharmap(PyObject *input,
7923 PyObject *mapping,
7924 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007925{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007926 /* input object */
7927 char *idata;
7928 Py_ssize_t size, i;
7929 int kind;
7930 /* output buffer */
7931 Py_UCS4 *output = NULL;
7932 Py_ssize_t osize;
7933 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007934 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007935 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007936 char *reason = "character maps to <undefined>";
7937 PyObject *errorHandler = NULL;
7938 PyObject *exc = NULL;
7939 /* the following variable is used for caching string comparisons
7940 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7941 * 3=ignore, 4=xmlcharrefreplace */
7942 int known_errorHandler = -1;
7943
Guido van Rossumd57fd912000-03-10 22:53:23 +00007944 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007945 PyErr_BadArgument();
7946 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007947 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007948
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007949 if (PyUnicode_READY(input) == -1)
7950 return NULL;
7951 idata = (char*)PyUnicode_DATA(input);
7952 kind = PyUnicode_KIND(input);
7953 size = PyUnicode_GET_LENGTH(input);
7954 i = 0;
7955
7956 if (size == 0) {
7957 Py_INCREF(input);
7958 return input;
7959 }
7960
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007961 /* allocate enough for a simple 1:1 translation without
7962 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007963 osize = size;
7964 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
7965 opos = 0;
7966 if (output == NULL) {
7967 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00007968 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007969 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007970
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007971 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007972 /* try to encode it */
7973 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007974 if (charmaptranslate_output(input, i, mapping,
7975 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007976 Py_XDECREF(x);
7977 goto onError;
7978 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007979 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00007980 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007981 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007982 else { /* untranslatable character */
7983 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7984 Py_ssize_t repsize;
7985 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007986 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00007987 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007988 Py_ssize_t collstart = i;
7989 Py_ssize_t collend = i+1;
7990 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007991
Benjamin Peterson29060642009-01-31 22:14:21 +00007992 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007993 while (collend < size) {
7994 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007995 goto onError;
7996 Py_XDECREF(x);
7997 if (x!=Py_None)
7998 break;
7999 ++collend;
8000 }
8001 /* cache callback name lookup
8002 * (if not done yet, i.e. it's the first error) */
8003 if (known_errorHandler==-1) {
8004 if ((errors==NULL) || (!strcmp(errors, "strict")))
8005 known_errorHandler = 1;
8006 else if (!strcmp(errors, "replace"))
8007 known_errorHandler = 2;
8008 else if (!strcmp(errors, "ignore"))
8009 known_errorHandler = 3;
8010 else if (!strcmp(errors, "xmlcharrefreplace"))
8011 known_errorHandler = 4;
8012 else
8013 known_errorHandler = 0;
8014 }
8015 switch (known_errorHandler) {
8016 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008017 raise_translate_exception(&exc, input, collstart,
8018 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008019 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008020 case 2: /* replace */
8021 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008022 for (coll = collstart; coll<collend; coll++)
8023 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008024 /* fall through */
8025 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008026 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008027 break;
8028 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008029 /* generate replacement (temporarily (mis)uses i) */
8030 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008031 char buffer[2+29+1+1];
8032 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008033 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8034 if (charmaptranslate_makespace(&output, &osize,
8035 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008036 goto onError;
8037 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008038 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008039 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008040 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008041 break;
8042 default:
8043 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008044 reason, input, &exc,
8045 collstart, collend, &newpos);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02008046 if (repunicode == NULL || _PyUnicode_READY_REPLACE(&repunicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00008047 goto onError;
8048 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008049 repsize = PyUnicode_GET_LENGTH(repunicode);
8050 if (charmaptranslate_makespace(&output, &osize,
8051 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008052 Py_DECREF(repunicode);
8053 goto onError;
8054 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008055 for (uni2 = 0; repsize-->0; ++uni2)
8056 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8057 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008058 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008059 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008060 }
8061 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008062 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8063 if (!res)
8064 goto onError;
8065 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008066 Py_XDECREF(exc);
8067 Py_XDECREF(errorHandler);
8068 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008069
Benjamin Peterson29060642009-01-31 22:14:21 +00008070 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008071 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008072 Py_XDECREF(exc);
8073 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008074 return NULL;
8075}
8076
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008077/* Deprecated. Use PyUnicode_Translate instead. */
8078PyObject *
8079PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8080 Py_ssize_t size,
8081 PyObject *mapping,
8082 const char *errors)
8083{
8084 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8085 if (!unicode)
8086 return NULL;
8087 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8088}
8089
Alexander Belopolsky40018472011-02-26 01:02:56 +00008090PyObject *
8091PyUnicode_Translate(PyObject *str,
8092 PyObject *mapping,
8093 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008094{
8095 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008096
Guido van Rossumd57fd912000-03-10 22:53:23 +00008097 str = PyUnicode_FromObject(str);
8098 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008099 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008100 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008101 Py_DECREF(str);
8102 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008103
Benjamin Peterson29060642009-01-31 22:14:21 +00008104 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008105 Py_XDECREF(str);
8106 return NULL;
8107}
Tim Petersced69f82003-09-16 20:30:58 +00008108
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008109static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008110fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008111{
8112 /* No need to call PyUnicode_READY(self) because this function is only
8113 called as a callback from fixup() which does it already. */
8114 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8115 const int kind = PyUnicode_KIND(self);
8116 void *data = PyUnicode_DATA(self);
8117 Py_UCS4 maxchar = 0, ch, fixed;
8118 Py_ssize_t i;
8119
8120 for (i = 0; i < len; ++i) {
8121 ch = PyUnicode_READ(kind, data, i);
8122 fixed = 0;
8123 if (ch > 127) {
8124 if (Py_UNICODE_ISSPACE(ch))
8125 fixed = ' ';
8126 else {
8127 const int decimal = Py_UNICODE_TODECIMAL(ch);
8128 if (decimal >= 0)
8129 fixed = '0' + decimal;
8130 }
8131 if (fixed != 0) {
8132 if (fixed > maxchar)
8133 maxchar = fixed;
8134 PyUnicode_WRITE(kind, data, i, fixed);
8135 }
8136 else if (ch > maxchar)
8137 maxchar = ch;
8138 }
8139 else if (ch > maxchar)
8140 maxchar = ch;
8141 }
8142
8143 return maxchar;
8144}
8145
8146PyObject *
8147_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8148{
8149 if (!PyUnicode_Check(unicode)) {
8150 PyErr_BadInternalCall();
8151 return NULL;
8152 }
8153 if (PyUnicode_READY(unicode) == -1)
8154 return NULL;
8155 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8156 /* If the string is already ASCII, just return the same string */
8157 Py_INCREF(unicode);
8158 return unicode;
8159 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008160 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008161}
8162
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008163PyObject *
8164PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8165 Py_ssize_t length)
8166{
8167 PyObject *result;
8168 Py_UNICODE *p; /* write pointer into result */
8169 Py_ssize_t i;
8170 /* Copy to a new string */
8171 result = (PyObject *)_PyUnicode_New(length);
8172 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
8173 if (result == NULL)
8174 return result;
8175 p = PyUnicode_AS_UNICODE(result);
8176 /* Iterate over code points */
8177 for (i = 0; i < length; i++) {
8178 Py_UNICODE ch =s[i];
8179 if (ch > 127) {
8180 int decimal = Py_UNICODE_TODECIMAL(ch);
8181 if (decimal >= 0)
8182 p[i] = '0' + decimal;
8183 }
8184 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008185#ifndef DONT_MAKE_RESULT_READY
8186 if (_PyUnicode_READY_REPLACE(&result)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008187 Py_DECREF(result);
8188 return NULL;
8189 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008190#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02008191 assert(_PyUnicode_CheckConsistency(result, 1));
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008192 return result;
8193}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008194/* --- Decimal Encoder ---------------------------------------------------- */
8195
Alexander Belopolsky40018472011-02-26 01:02:56 +00008196int
8197PyUnicode_EncodeDecimal(Py_UNICODE *s,
8198 Py_ssize_t length,
8199 char *output,
8200 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008201{
8202 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008203 PyObject *errorHandler = NULL;
8204 PyObject *exc = NULL;
8205 const char *encoding = "decimal";
8206 const char *reason = "invalid decimal Unicode string";
8207 /* the following variable is used for caching string comparisons
8208 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
8209 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008210
8211 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008212 PyErr_BadArgument();
8213 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008214 }
8215
8216 p = s;
8217 end = s + length;
8218 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008219 register Py_UNICODE ch = *p;
8220 int decimal;
8221 PyObject *repunicode;
8222 Py_ssize_t repsize;
8223 Py_ssize_t newpos;
8224 Py_UNICODE *uni2;
8225 Py_UNICODE *collstart;
8226 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00008227
Benjamin Peterson29060642009-01-31 22:14:21 +00008228 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008229 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00008230 ++p;
8231 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008232 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008233 decimal = Py_UNICODE_TODECIMAL(ch);
8234 if (decimal >= 0) {
8235 *output++ = '0' + decimal;
8236 ++p;
8237 continue;
8238 }
8239 if (0 < ch && ch < 256) {
8240 *output++ = (char)ch;
8241 ++p;
8242 continue;
8243 }
8244 /* All other characters are considered unencodable */
8245 collstart = p;
8246 collend = p+1;
8247 while (collend < end) {
8248 if ((0 < *collend && *collend < 256) ||
8249 !Py_UNICODE_ISSPACE(*collend) ||
8250 Py_UNICODE_TODECIMAL(*collend))
8251 break;
8252 }
8253 /* cache callback name lookup
8254 * (if not done yet, i.e. it's the first error) */
8255 if (known_errorHandler==-1) {
8256 if ((errors==NULL) || (!strcmp(errors, "strict")))
8257 known_errorHandler = 1;
8258 else if (!strcmp(errors, "replace"))
8259 known_errorHandler = 2;
8260 else if (!strcmp(errors, "ignore"))
8261 known_errorHandler = 3;
8262 else if (!strcmp(errors, "xmlcharrefreplace"))
8263 known_errorHandler = 4;
8264 else
8265 known_errorHandler = 0;
8266 }
8267 switch (known_errorHandler) {
8268 case 1: /* strict */
8269 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
8270 goto onError;
8271 case 2: /* replace */
8272 for (p = collstart; p < collend; ++p)
8273 *output++ = '?';
8274 /* fall through */
8275 case 3: /* ignore */
8276 p = collend;
8277 break;
8278 case 4: /* xmlcharrefreplace */
8279 /* generate replacement (temporarily (mis)uses p) */
8280 for (p = collstart; p < collend; ++p)
8281 output += sprintf(output, "&#%d;", (int)*p);
8282 p = collend;
8283 break;
8284 default:
8285 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
8286 encoding, reason, s, length, &exc,
8287 collstart-s, collend-s, &newpos);
8288 if (repunicode == NULL)
8289 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008290 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00008291 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008292 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8293 Py_DECREF(repunicode);
8294 goto onError;
8295 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008296 /* generate replacement */
8297 repsize = PyUnicode_GET_SIZE(repunicode);
8298 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
8299 Py_UNICODE ch = *uni2;
8300 if (Py_UNICODE_ISSPACE(ch))
8301 *output++ = ' ';
8302 else {
8303 decimal = Py_UNICODE_TODECIMAL(ch);
8304 if (decimal >= 0)
8305 *output++ = '0' + decimal;
8306 else if (0 < ch && ch < 256)
8307 *output++ = (char)ch;
8308 else {
8309 Py_DECREF(repunicode);
8310 raise_encode_exception(&exc, encoding,
8311 s, length, collstart-s, collend-s, reason);
8312 goto onError;
8313 }
8314 }
8315 }
8316 p = s + newpos;
8317 Py_DECREF(repunicode);
8318 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00008319 }
8320 /* 0-terminate the output string */
8321 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008322 Py_XDECREF(exc);
8323 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008324 return 0;
8325
Benjamin Peterson29060642009-01-31 22:14:21 +00008326 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008327 Py_XDECREF(exc);
8328 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008329 return -1;
8330}
8331
Guido van Rossumd57fd912000-03-10 22:53:23 +00008332/* --- Helpers ------------------------------------------------------------ */
8333
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008334#include "stringlib/ucs1lib.h"
8335#include "stringlib/fastsearch.h"
8336#include "stringlib/partition.h"
8337#include "stringlib/split.h"
8338#include "stringlib/count.h"
8339#include "stringlib/find.h"
8340#include "stringlib/localeutil.h"
8341#include "stringlib/undef.h"
8342
8343#include "stringlib/ucs2lib.h"
8344#include "stringlib/fastsearch.h"
8345#include "stringlib/partition.h"
8346#include "stringlib/split.h"
8347#include "stringlib/count.h"
8348#include "stringlib/find.h"
8349#include "stringlib/localeutil.h"
8350#include "stringlib/undef.h"
8351
8352#include "stringlib/ucs4lib.h"
8353#include "stringlib/fastsearch.h"
8354#include "stringlib/partition.h"
8355#include "stringlib/split.h"
8356#include "stringlib/count.h"
8357#include "stringlib/find.h"
8358#include "stringlib/localeutil.h"
8359#include "stringlib/undef.h"
8360
8361static Py_ssize_t
8362any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
8363 const Py_UCS1*, Py_ssize_t,
8364 Py_ssize_t, Py_ssize_t),
8365 Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
8366 const Py_UCS2*, Py_ssize_t,
8367 Py_ssize_t, Py_ssize_t),
8368 Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t,
8369 const Py_UCS4*, Py_ssize_t,
8370 Py_ssize_t, Py_ssize_t),
8371 PyObject* s1, PyObject* s2,
8372 Py_ssize_t start,
8373 Py_ssize_t end)
8374{
8375 int kind1, kind2, kind;
8376 void *buf1, *buf2;
8377 Py_ssize_t len1, len2, result;
8378
8379 kind1 = PyUnicode_KIND(s1);
8380 kind2 = PyUnicode_KIND(s2);
8381 kind = kind1 > kind2 ? kind1 : kind2;
8382 buf1 = PyUnicode_DATA(s1);
8383 buf2 = PyUnicode_DATA(s2);
8384 if (kind1 != kind)
8385 buf1 = _PyUnicode_AsKind(s1, kind);
8386 if (!buf1)
8387 return -2;
8388 if (kind2 != kind)
8389 buf2 = _PyUnicode_AsKind(s2, kind);
8390 if (!buf2) {
8391 if (kind1 != kind) PyMem_Free(buf1);
8392 return -2;
8393 }
8394 len1 = PyUnicode_GET_LENGTH(s1);
8395 len2 = PyUnicode_GET_LENGTH(s2);
8396
8397 switch(kind) {
8398 case PyUnicode_1BYTE_KIND:
8399 result = ucs1(buf1, len1, buf2, len2, start, end);
8400 break;
8401 case PyUnicode_2BYTE_KIND:
8402 result = ucs2(buf1, len1, buf2, len2, start, end);
8403 break;
8404 case PyUnicode_4BYTE_KIND:
8405 result = ucs4(buf1, len1, buf2, len2, start, end);
8406 break;
8407 default:
8408 assert(0); result = -2;
8409 }
8410
8411 if (kind1 != kind)
8412 PyMem_Free(buf1);
8413 if (kind2 != kind)
8414 PyMem_Free(buf2);
8415
8416 return result;
8417}
8418
8419Py_ssize_t
8420_PyUnicode_InsertThousandsGrouping(int kind, void *data,
8421 Py_ssize_t n_buffer,
8422 void *digits, Py_ssize_t n_digits,
8423 Py_ssize_t min_width,
8424 const char *grouping,
8425 const char *thousands_sep)
8426{
8427 switch(kind) {
8428 case PyUnicode_1BYTE_KIND:
8429 return _PyUnicode_ucs1_InsertThousandsGrouping(
8430 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8431 min_width, grouping, thousands_sep);
8432 case PyUnicode_2BYTE_KIND:
8433 return _PyUnicode_ucs2_InsertThousandsGrouping(
8434 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
8435 min_width, grouping, thousands_sep);
8436 case PyUnicode_4BYTE_KIND:
8437 return _PyUnicode_ucs4_InsertThousandsGrouping(
8438 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
8439 min_width, grouping, thousands_sep);
8440 }
8441 assert(0);
8442 return -1;
8443}
8444
8445
Eric Smith8c663262007-08-25 02:26:07 +00008446#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00008447#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008448
Thomas Wouters477c8d52006-05-27 19:21:47 +00008449#include "stringlib/count.h"
8450#include "stringlib/find.h"
Eric Smith5807c412008-05-11 21:00:57 +00008451
Thomas Wouters477c8d52006-05-27 19:21:47 +00008452/* helper macro to fixup start/end slice values */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008453#define ADJUST_INDICES(start, end, len) \
8454 if (end > len) \
8455 end = len; \
8456 else if (end < 0) { \
8457 end += len; \
8458 if (end < 0) \
8459 end = 0; \
8460 } \
8461 if (start < 0) { \
8462 start += len; \
8463 if (start < 0) \
8464 start = 0; \
8465 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008466
Alexander Belopolsky40018472011-02-26 01:02:56 +00008467Py_ssize_t
8468PyUnicode_Count(PyObject *str,
8469 PyObject *substr,
8470 Py_ssize_t start,
8471 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008472{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008473 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008474 PyUnicodeObject* str_obj;
8475 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008476 int kind1, kind2, kind;
8477 void *buf1 = NULL, *buf2 = NULL;
8478 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008479
Thomas Wouters477c8d52006-05-27 19:21:47 +00008480 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008481 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008482 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008483 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02008484 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008485 Py_DECREF(str_obj);
8486 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008487 }
Tim Petersced69f82003-09-16 20:30:58 +00008488
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008489 kind1 = PyUnicode_KIND(str_obj);
8490 kind2 = PyUnicode_KIND(sub_obj);
8491 kind = kind1 > kind2 ? kind1 : kind2;
8492 buf1 = PyUnicode_DATA(str_obj);
8493 if (kind1 != kind)
8494 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
8495 if (!buf1)
8496 goto onError;
8497 buf2 = PyUnicode_DATA(sub_obj);
8498 if (kind2 != kind)
8499 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
8500 if (!buf2)
8501 goto onError;
8502 len1 = PyUnicode_GET_LENGTH(str_obj);
8503 len2 = PyUnicode_GET_LENGTH(sub_obj);
8504
8505 ADJUST_INDICES(start, end, len1);
8506 switch(kind) {
8507 case PyUnicode_1BYTE_KIND:
8508 result = ucs1lib_count(
8509 ((Py_UCS1*)buf1) + start, end - start,
8510 buf2, len2, PY_SSIZE_T_MAX
8511 );
8512 break;
8513 case PyUnicode_2BYTE_KIND:
8514 result = ucs2lib_count(
8515 ((Py_UCS2*)buf1) + start, end - start,
8516 buf2, len2, PY_SSIZE_T_MAX
8517 );
8518 break;
8519 case PyUnicode_4BYTE_KIND:
8520 result = ucs4lib_count(
8521 ((Py_UCS4*)buf1) + start, end - start,
8522 buf2, len2, PY_SSIZE_T_MAX
8523 );
8524 break;
8525 default:
8526 assert(0); result = 0;
8527 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008528
8529 Py_DECREF(sub_obj);
8530 Py_DECREF(str_obj);
8531
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008532 if (kind1 != kind)
8533 PyMem_Free(buf1);
8534 if (kind2 != kind)
8535 PyMem_Free(buf2);
8536
Guido van Rossumd57fd912000-03-10 22:53:23 +00008537 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008538 onError:
8539 Py_DECREF(sub_obj);
8540 Py_DECREF(str_obj);
8541 if (kind1 != kind && buf1)
8542 PyMem_Free(buf1);
8543 if (kind2 != kind && buf2)
8544 PyMem_Free(buf2);
8545 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008546}
8547
Alexander Belopolsky40018472011-02-26 01:02:56 +00008548Py_ssize_t
8549PyUnicode_Find(PyObject *str,
8550 PyObject *sub,
8551 Py_ssize_t start,
8552 Py_ssize_t end,
8553 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008554{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008555 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008556
Guido van Rossumd57fd912000-03-10 22:53:23 +00008557 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008558 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008559 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008560 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008561 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008562 Py_DECREF(str);
8563 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008564 }
Tim Petersced69f82003-09-16 20:30:58 +00008565
Thomas Wouters477c8d52006-05-27 19:21:47 +00008566 if (direction > 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008567 result = any_find_slice(
8568 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
8569 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008570 );
8571 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008572 result = any_find_slice(
8573 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
8574 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008575 );
8576
Guido van Rossumd57fd912000-03-10 22:53:23 +00008577 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008578 Py_DECREF(sub);
8579
Guido van Rossumd57fd912000-03-10 22:53:23 +00008580 return result;
8581}
8582
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008583Py_ssize_t
8584PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8585 Py_ssize_t start, Py_ssize_t end,
8586 int direction)
8587{
8588 char *result;
8589 int kind;
8590 if (PyUnicode_READY(str) == -1)
8591 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02008592 if (start < 0 || end < 0) {
8593 PyErr_SetString(PyExc_IndexError, "string index out of range");
8594 return -2;
8595 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008596 if (end > PyUnicode_GET_LENGTH(str))
8597 end = PyUnicode_GET_LENGTH(str);
8598 kind = PyUnicode_KIND(str);
8599 result = findchar(PyUnicode_1BYTE_DATA(str)
8600 + PyUnicode_KIND_SIZE(kind, start),
8601 kind,
8602 end-start, ch, direction);
8603 if (!result)
8604 return -1;
8605 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8606}
8607
Alexander Belopolsky40018472011-02-26 01:02:56 +00008608static int
8609tailmatch(PyUnicodeObject *self,
8610 PyUnicodeObject *substring,
8611 Py_ssize_t start,
8612 Py_ssize_t end,
8613 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008614{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008615 int kind_self;
8616 int kind_sub;
8617 void *data_self;
8618 void *data_sub;
8619 Py_ssize_t offset;
8620 Py_ssize_t i;
8621 Py_ssize_t end_sub;
8622
8623 if (PyUnicode_READY(self) == -1 ||
8624 PyUnicode_READY(substring) == -1)
8625 return 0;
8626
8627 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008628 return 1;
8629
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008630 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8631 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008632 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008633 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008634
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008635 kind_self = PyUnicode_KIND(self);
8636 data_self = PyUnicode_DATA(self);
8637 kind_sub = PyUnicode_KIND(substring);
8638 data_sub = PyUnicode_DATA(substring);
8639 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8640
8641 if (direction > 0)
8642 offset = end;
8643 else
8644 offset = start;
8645
8646 if (PyUnicode_READ(kind_self, data_self, offset) ==
8647 PyUnicode_READ(kind_sub, data_sub, 0) &&
8648 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8649 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8650 /* If both are of the same kind, memcmp is sufficient */
8651 if (kind_self == kind_sub) {
8652 return ! memcmp((char *)data_self +
8653 (offset * PyUnicode_CHARACTER_SIZE(substring)),
8654 data_sub,
8655 PyUnicode_GET_LENGTH(substring) *
8656 PyUnicode_CHARACTER_SIZE(substring));
8657 }
8658 /* otherwise we have to compare each character by first accesing it */
8659 else {
8660 /* We do not need to compare 0 and len(substring)-1 because
8661 the if statement above ensured already that they are equal
8662 when we end up here. */
8663 // TODO: honor direction and do a forward or backwards search
8664 for (i = 1; i < end_sub; ++i) {
8665 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8666 PyUnicode_READ(kind_sub, data_sub, i))
8667 return 0;
8668 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008669 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008670 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008671 }
8672
8673 return 0;
8674}
8675
Alexander Belopolsky40018472011-02-26 01:02:56 +00008676Py_ssize_t
8677PyUnicode_Tailmatch(PyObject *str,
8678 PyObject *substr,
8679 Py_ssize_t start,
8680 Py_ssize_t end,
8681 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008682{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008683 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008684
Guido van Rossumd57fd912000-03-10 22:53:23 +00008685 str = PyUnicode_FromObject(str);
8686 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008687 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008688 substr = PyUnicode_FromObject(substr);
8689 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008690 Py_DECREF(str);
8691 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008692 }
Tim Petersced69f82003-09-16 20:30:58 +00008693
Guido van Rossumd57fd912000-03-10 22:53:23 +00008694 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00008695 (PyUnicodeObject *)substr,
8696 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008697 Py_DECREF(str);
8698 Py_DECREF(substr);
8699 return result;
8700}
8701
Guido van Rossumd57fd912000-03-10 22:53:23 +00008702/* Apply fixfct filter to the Unicode object self and return a
8703 reference to the modified object */
8704
Alexander Belopolsky40018472011-02-26 01:02:56 +00008705static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02008706fixup(PyObject *self,
8707 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008708{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008709 PyObject *u;
8710 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008711
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008712 if (PyUnicode_READY(self) == -1)
8713 return NULL;
8714 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8715 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8716 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008717 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008718 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008719
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008720 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
8721 PyUnicode_GET_LENGTH(u) * PyUnicode_CHARACTER_SIZE(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008722
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008723 /* fix functions return the new maximum character in a string,
8724 if the kind of the resulting unicode object does not change,
8725 everything is fine. Otherwise we need to change the string kind
8726 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02008727 maxchar_new = fixfct(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008728 if (maxchar_new == 0)
8729 /* do nothing, keep maxchar_new at 0 which means no changes. */;
8730 else if (maxchar_new <= 127)
8731 maxchar_new = 127;
8732 else if (maxchar_new <= 255)
8733 maxchar_new = 255;
8734 else if (maxchar_new <= 65535)
8735 maxchar_new = 65535;
8736 else
8737 maxchar_new = 1114111; /* 0x10ffff */
8738
8739 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008740 /* fixfct should return TRUE if it modified the buffer. If
8741 FALSE, return a reference to the original buffer instead
8742 (to save space, not time) */
8743 Py_INCREF(self);
8744 Py_DECREF(u);
8745 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008746 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008747 else if (maxchar_new == maxchar_old) {
8748 return u;
8749 }
8750 else {
8751 /* In case the maximum character changed, we need to
8752 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008753 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008754 if (v == NULL) {
8755 Py_DECREF(u);
8756 return NULL;
8757 }
8758 if (maxchar_new > maxchar_old) {
8759 /* If the maxchar increased so that the kind changed, not all
8760 characters are representable anymore and we need to fix the
8761 string again. This only happens in very few cases. */
Victor Stinner157f83f2011-09-28 21:41:31 +02008762 if (PyUnicode_CopyCharacters(v, 0,
8763 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008764 PyUnicode_GET_LENGTH(self)) < 0)
8765 {
8766 Py_DECREF(u);
8767 return NULL;
8768 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008769 maxchar_old = fixfct(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008770 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8771 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008772 else {
Victor Stinner157f83f2011-09-28 21:41:31 +02008773 if (PyUnicode_CopyCharacters(v, 0,
8774 u, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008775 PyUnicode_GET_LENGTH(self)) < 0)
8776 {
8777 Py_DECREF(u);
8778 return NULL;
8779 }
8780 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008781
8782 Py_DECREF(u);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02008783 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008784 return v;
8785 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008786}
8787
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008788static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008789fixupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008790{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008791 /* No need to call PyUnicode_READY(self) because this function is only
8792 called as a callback from fixup() which does it already. */
8793 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8794 const int kind = PyUnicode_KIND(self);
8795 void *data = PyUnicode_DATA(self);
8796 int touched = 0;
8797 Py_UCS4 maxchar = 0;
8798 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008799
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008800 for (i = 0; i < len; ++i) {
8801 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8802 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8803 if (up != ch) {
8804 if (up > maxchar)
8805 maxchar = up;
8806 PyUnicode_WRITE(kind, data, i, up);
8807 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008808 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008809 else if (ch > maxchar)
8810 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008811 }
8812
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008813 if (touched)
8814 return maxchar;
8815 else
8816 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008817}
8818
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008819static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008820fixlower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008821{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008822 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8823 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8824 const int kind = PyUnicode_KIND(self);
8825 void *data = PyUnicode_DATA(self);
8826 int touched = 0;
8827 Py_UCS4 maxchar = 0;
8828 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008829
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008830 for(i = 0; i < len; ++i) {
8831 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8832 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8833 if (lo != ch) {
8834 if (lo > maxchar)
8835 maxchar = lo;
8836 PyUnicode_WRITE(kind, data, i, lo);
8837 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008838 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008839 else if (ch > maxchar)
8840 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008841 }
8842
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008843 if (touched)
8844 return maxchar;
8845 else
8846 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008847}
8848
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008849static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008850fixswapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008851{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008852 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8853 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8854 const int kind = PyUnicode_KIND(self);
8855 void *data = PyUnicode_DATA(self);
8856 int touched = 0;
8857 Py_UCS4 maxchar = 0;
8858 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008859
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008860 for(i = 0; i < len; ++i) {
8861 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8862 Py_UCS4 nu = 0;
8863
8864 if (Py_UNICODE_ISUPPER(ch))
8865 nu = Py_UNICODE_TOLOWER(ch);
8866 else if (Py_UNICODE_ISLOWER(ch))
8867 nu = Py_UNICODE_TOUPPER(ch);
8868
8869 if (nu != 0) {
8870 if (nu > maxchar)
8871 maxchar = nu;
8872 PyUnicode_WRITE(kind, data, i, nu);
8873 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008874 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008875 else if (ch > maxchar)
8876 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008877 }
8878
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008879 if (touched)
8880 return maxchar;
8881 else
8882 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008883}
8884
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008885static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008886fixcapitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008887{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008888 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8889 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8890 const int kind = PyUnicode_KIND(self);
8891 void *data = PyUnicode_DATA(self);
8892 int touched = 0;
8893 Py_UCS4 maxchar = 0;
8894 Py_ssize_t i = 0;
8895 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00008896
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008897 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008898 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008899
8900 ch = PyUnicode_READ(kind, data, i);
8901 if (!Py_UNICODE_ISUPPER(ch)) {
8902 maxchar = Py_UNICODE_TOUPPER(ch);
8903 PyUnicode_WRITE(kind, data, i, maxchar);
8904 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008905 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008906 ++i;
8907 for(; i < len; ++i) {
8908 ch = PyUnicode_READ(kind, data, i);
8909 if (!Py_UNICODE_ISLOWER(ch)) {
8910 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8911 if (lo > maxchar)
8912 maxchar = lo;
8913 PyUnicode_WRITE(kind, data, i, lo);
8914 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008915 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008916 else if (ch > maxchar)
8917 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008918 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008919
8920 if (touched)
8921 return maxchar;
8922 else
8923 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008924}
8925
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008926static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008927fixtitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008928{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008929 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8930 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8931 const int kind = PyUnicode_KIND(self);
8932 void *data = PyUnicode_DATA(self);
8933 Py_UCS4 maxchar = 0;
8934 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008935 int previous_is_cased;
8936
8937 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008938 if (len == 1) {
8939 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8940 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
8941 if (ti != ch) {
8942 PyUnicode_WRITE(kind, data, i, ti);
8943 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00008944 }
8945 else
8946 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008947 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008948 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008949 for(; i < len; ++i) {
8950 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8951 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00008952
Benjamin Peterson29060642009-01-31 22:14:21 +00008953 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008954 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00008955 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008956 nu = Py_UNICODE_TOTITLE(ch);
8957
8958 if (nu > maxchar)
8959 maxchar = nu;
8960 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00008961
Benjamin Peterson29060642009-01-31 22:14:21 +00008962 if (Py_UNICODE_ISLOWER(ch) ||
8963 Py_UNICODE_ISUPPER(ch) ||
8964 Py_UNICODE_ISTITLE(ch))
8965 previous_is_cased = 1;
8966 else
8967 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008968 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008969 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008970}
8971
Tim Peters8ce9f162004-08-27 01:49:32 +00008972PyObject *
8973PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008974{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008975 PyObject *sep = NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008976 Py_ssize_t seplen = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008977 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00008978 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008979 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
8980 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00008981 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008982 Py_ssize_t sz, i, res_offset;
8983 Py_UCS4 maxchar = 0;
8984 Py_UCS4 item_maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008985
Tim Peters05eba1f2004-08-27 21:32:02 +00008986 fseq = PySequence_Fast(seq, "");
8987 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008988 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00008989 }
8990
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008991 /* NOTE: the following code can't call back into Python code,
8992 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00008993 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008994
Tim Peters05eba1f2004-08-27 21:32:02 +00008995 seqlen = PySequence_Fast_GET_SIZE(fseq);
8996 /* If empty sequence, return u"". */
8997 if (seqlen == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008998 res = PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008999 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00009000 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009001 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00009002 /* If singleton sequence with an exact Unicode, return that. */
9003 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009004 item = items[0];
9005 if (PyUnicode_CheckExact(item)) {
9006 Py_INCREF(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009007 res = item;
Benjamin Peterson29060642009-01-31 22:14:21 +00009008 goto Done;
9009 }
Tim Peters8ce9f162004-08-27 01:49:32 +00009010 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009011 else {
9012 /* Set up sep and seplen */
9013 if (separator == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009014 /* fall back to a blank space separator */
9015 sep = PyUnicode_FromOrdinal(' ');
Victor Stinnere9a29352011-10-01 02:14:59 +02009016 if (!sep)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009017 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00009018 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009019 else {
9020 if (!PyUnicode_Check(separator)) {
9021 PyErr_Format(PyExc_TypeError,
9022 "separator: expected str instance,"
9023 " %.80s found",
9024 Py_TYPE(separator)->tp_name);
9025 goto onError;
9026 }
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02009027 if (PyUnicode_READY(separator))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009028 goto onError;
9029 sep = separator;
9030 seplen = PyUnicode_GET_LENGTH(separator);
9031 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
Georg Brandl7597add2011-10-05 16:36:47 +02009032 /* inc refcount to keep this code path symmetric with the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009033 above case of a blank separator */
9034 Py_INCREF(sep);
Tim Peters05eba1f2004-08-27 21:32:02 +00009035 }
9036 }
9037
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009038 /* There are at least two things to join, or else we have a subclass
9039 * of str in the sequence.
9040 * Do a pre-pass to figure out the total amount of space we'll
9041 * need (sz), and see whether all argument are strings.
9042 */
9043 sz = 0;
9044 for (i = 0; i < seqlen; i++) {
9045 const Py_ssize_t old_sz = sz;
9046 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009047 if (!PyUnicode_Check(item)) {
9048 PyErr_Format(PyExc_TypeError,
9049 "sequence item %zd: expected str instance,"
9050 " %.80s found",
9051 i, Py_TYPE(item)->tp_name);
9052 goto onError;
9053 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009054 if (PyUnicode_READY(item) == -1)
9055 goto onError;
9056 sz += PyUnicode_GET_LENGTH(item);
9057 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
9058 if (item_maxchar > maxchar)
9059 maxchar = item_maxchar;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009060 if (i != 0)
9061 sz += seplen;
9062 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9063 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009064 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009065 goto onError;
9066 }
9067 }
Tim Petersced69f82003-09-16 20:30:58 +00009068
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009069 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009070 if (res == NULL)
9071 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009072
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009073 /* Catenate everything. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009074 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinner9ce5a832011-10-03 23:36:02 +02009075 Py_ssize_t itemlen, copied;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009076 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009077 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009078 if (i && seplen != 0) {
9079 copied = PyUnicode_CopyCharacters(res, res_offset,
9080 sep, 0, seplen);
9081 if (copied < 0)
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009082 goto onError;
Victor Stinner9ce5a832011-10-03 23:36:02 +02009083#ifdef Py_DEBUG
9084 res_offset += copied;
9085#else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009086 res_offset += seplen;
Victor Stinner9ce5a832011-10-03 23:36:02 +02009087#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00009088 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009089 itemlen = PyUnicode_GET_LENGTH(item);
9090 if (itemlen != 0) {
9091 copied = PyUnicode_CopyCharacters(res, res_offset,
9092 item, 0, itemlen);
9093 if (copied < 0)
9094 goto onError;
9095#ifdef Py_DEBUG
9096 res_offset += copied;
9097#else
9098 res_offset += itemlen;
9099#endif
9100 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009101 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009102 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009103
Benjamin Peterson29060642009-01-31 22:14:21 +00009104 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00009105 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009106 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009107 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009108 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009109
Benjamin Peterson29060642009-01-31 22:14:21 +00009110 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009111 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009112 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009113 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009114 return NULL;
9115}
9116
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009117#define FILL(kind, data, value, start, length) \
9118 do { \
9119 Py_ssize_t i_ = 0; \
9120 assert(kind != PyUnicode_WCHAR_KIND); \
9121 switch ((kind)) { \
9122 case PyUnicode_1BYTE_KIND: { \
9123 unsigned char * to_ = (unsigned char *)((data)) + (start); \
9124 memset(to_, (unsigned char)value, length); \
9125 break; \
9126 } \
9127 case PyUnicode_2BYTE_KIND: { \
9128 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
9129 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9130 break; \
9131 } \
9132 default: { \
9133 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
9134 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9135 break; \
9136 } \
9137 } \
9138 } while (0)
9139
Victor Stinner9310abb2011-10-05 00:59:23 +02009140static PyObject *
9141pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009142 Py_ssize_t left,
9143 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009144 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009145{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009146 PyObject *u;
9147 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009148 int kind;
9149 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009150
9151 if (left < 0)
9152 left = 0;
9153 if (right < 0)
9154 right = 0;
9155
Tim Peters7a29bd52001-09-12 03:03:31 +00009156 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009157 Py_INCREF(self);
9158 return self;
9159 }
9160
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009161 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9162 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009163 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9164 return NULL;
9165 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009166 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9167 if (fill > maxchar)
9168 maxchar = fill;
9169 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009170 if (!u)
9171 return NULL;
9172
9173 kind = PyUnicode_KIND(u);
9174 data = PyUnicode_DATA(u);
9175 if (left)
9176 FILL(kind, data, fill, 0, left);
9177 if (right)
9178 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinner157f83f2011-09-28 21:41:31 +02009179 if (PyUnicode_CopyCharacters(u, left,
9180 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009181 _PyUnicode_LENGTH(self)) < 0)
9182 {
9183 Py_DECREF(u);
9184 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009185 }
9186
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009187 assert(_PyUnicode_CheckConsistency(u, 1));
9188 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009189}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009190#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009191
Alexander Belopolsky40018472011-02-26 01:02:56 +00009192PyObject *
9193PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009194{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009195 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009196
9197 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009198 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009199 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009200
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009201 switch(PyUnicode_KIND(string)) {
9202 case PyUnicode_1BYTE_KIND:
9203 list = ucs1lib_splitlines(
9204 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
9205 PyUnicode_GET_LENGTH(string), keepends);
9206 break;
9207 case PyUnicode_2BYTE_KIND:
9208 list = ucs2lib_splitlines(
9209 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
9210 PyUnicode_GET_LENGTH(string), keepends);
9211 break;
9212 case PyUnicode_4BYTE_KIND:
9213 list = ucs4lib_splitlines(
9214 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
9215 PyUnicode_GET_LENGTH(string), keepends);
9216 break;
9217 default:
9218 assert(0);
9219 list = 0;
9220 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009221 Py_DECREF(string);
9222 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009223}
9224
Alexander Belopolsky40018472011-02-26 01:02:56 +00009225static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009226split(PyObject *self,
9227 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009228 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009229{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009230 int kind1, kind2, kind;
9231 void *buf1, *buf2;
9232 Py_ssize_t len1, len2;
9233 PyObject* out;
9234
Guido van Rossumd57fd912000-03-10 22:53:23 +00009235 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009236 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009237
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009238 if (PyUnicode_READY(self) == -1)
9239 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009240
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009241 if (substring == NULL)
9242 switch(PyUnicode_KIND(self)) {
9243 case PyUnicode_1BYTE_KIND:
9244 return ucs1lib_split_whitespace(
9245 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9246 PyUnicode_GET_LENGTH(self), maxcount
9247 );
9248 case PyUnicode_2BYTE_KIND:
9249 return ucs2lib_split_whitespace(
9250 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9251 PyUnicode_GET_LENGTH(self), maxcount
9252 );
9253 case PyUnicode_4BYTE_KIND:
9254 return ucs4lib_split_whitespace(
9255 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9256 PyUnicode_GET_LENGTH(self), maxcount
9257 );
9258 default:
9259 assert(0);
9260 return NULL;
9261 }
9262
9263 if (PyUnicode_READY(substring) == -1)
9264 return NULL;
9265
9266 kind1 = PyUnicode_KIND(self);
9267 kind2 = PyUnicode_KIND(substring);
9268 kind = kind1 > kind2 ? kind1 : kind2;
9269 buf1 = PyUnicode_DATA(self);
9270 buf2 = PyUnicode_DATA(substring);
9271 if (kind1 != kind)
9272 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9273 if (!buf1)
9274 return NULL;
9275 if (kind2 != kind)
9276 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9277 if (!buf2) {
9278 if (kind1 != kind) PyMem_Free(buf1);
9279 return NULL;
9280 }
9281 len1 = PyUnicode_GET_LENGTH(self);
9282 len2 = PyUnicode_GET_LENGTH(substring);
9283
9284 switch(kind) {
9285 case PyUnicode_1BYTE_KIND:
9286 out = ucs1lib_split(
9287 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9288 break;
9289 case PyUnicode_2BYTE_KIND:
9290 out = ucs2lib_split(
9291 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9292 break;
9293 case PyUnicode_4BYTE_KIND:
9294 out = ucs4lib_split(
9295 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9296 break;
9297 default:
9298 out = NULL;
9299 }
9300 if (kind1 != kind)
9301 PyMem_Free(buf1);
9302 if (kind2 != kind)
9303 PyMem_Free(buf2);
9304 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009305}
9306
Alexander Belopolsky40018472011-02-26 01:02:56 +00009307static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009308rsplit(PyObject *self,
9309 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009310 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009311{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009312 int kind1, kind2, kind;
9313 void *buf1, *buf2;
9314 Py_ssize_t len1, len2;
9315 PyObject* out;
9316
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009317 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009318 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009319
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009320 if (PyUnicode_READY(self) == -1)
9321 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009322
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009323 if (substring == NULL)
9324 switch(PyUnicode_KIND(self)) {
9325 case PyUnicode_1BYTE_KIND:
9326 return ucs1lib_rsplit_whitespace(
9327 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9328 PyUnicode_GET_LENGTH(self), maxcount
9329 );
9330 case PyUnicode_2BYTE_KIND:
9331 return ucs2lib_rsplit_whitespace(
9332 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9333 PyUnicode_GET_LENGTH(self), maxcount
9334 );
9335 case PyUnicode_4BYTE_KIND:
9336 return ucs4lib_rsplit_whitespace(
9337 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9338 PyUnicode_GET_LENGTH(self), maxcount
9339 );
9340 default:
9341 assert(0);
9342 return NULL;
9343 }
9344
9345 if (PyUnicode_READY(substring) == -1)
9346 return NULL;
9347
9348 kind1 = PyUnicode_KIND(self);
9349 kind2 = PyUnicode_KIND(substring);
9350 kind = kind1 > kind2 ? kind1 : kind2;
9351 buf1 = PyUnicode_DATA(self);
9352 buf2 = PyUnicode_DATA(substring);
9353 if (kind1 != kind)
9354 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9355 if (!buf1)
9356 return NULL;
9357 if (kind2 != kind)
9358 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9359 if (!buf2) {
9360 if (kind1 != kind) PyMem_Free(buf1);
9361 return NULL;
9362 }
9363 len1 = PyUnicode_GET_LENGTH(self);
9364 len2 = PyUnicode_GET_LENGTH(substring);
9365
9366 switch(kind) {
9367 case PyUnicode_1BYTE_KIND:
9368 out = ucs1lib_rsplit(
9369 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9370 break;
9371 case PyUnicode_2BYTE_KIND:
9372 out = ucs2lib_rsplit(
9373 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9374 break;
9375 case PyUnicode_4BYTE_KIND:
9376 out = ucs4lib_rsplit(
9377 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9378 break;
9379 default:
9380 out = NULL;
9381 }
9382 if (kind1 != kind)
9383 PyMem_Free(buf1);
9384 if (kind2 != kind)
9385 PyMem_Free(buf2);
9386 return out;
9387}
9388
9389static Py_ssize_t
9390anylib_find(int kind, void *buf1, Py_ssize_t len1,
9391 void *buf2, Py_ssize_t len2, Py_ssize_t offset)
9392{
9393 switch(kind) {
9394 case PyUnicode_1BYTE_KIND:
9395 return ucs1lib_find(buf1, len1, buf2, len2, offset);
9396 case PyUnicode_2BYTE_KIND:
9397 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9398 case PyUnicode_4BYTE_KIND:
9399 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9400 }
9401 assert(0);
9402 return -1;
9403}
9404
9405static Py_ssize_t
9406anylib_count(int kind, void* sbuf, Py_ssize_t slen,
9407 void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
9408{
9409 switch(kind) {
9410 case PyUnicode_1BYTE_KIND:
9411 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
9412 case PyUnicode_2BYTE_KIND:
9413 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9414 case PyUnicode_4BYTE_KIND:
9415 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9416 }
9417 assert(0);
9418 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009419}
9420
Alexander Belopolsky40018472011-02-26 01:02:56 +00009421static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009422replace(PyObject *self, PyObject *str1,
9423 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009424{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009425 PyObject *u;
9426 char *sbuf = PyUnicode_DATA(self);
9427 char *buf1 = PyUnicode_DATA(str1);
9428 char *buf2 = PyUnicode_DATA(str2);
9429 int srelease = 0, release1 = 0, release2 = 0;
9430 int skind = PyUnicode_KIND(self);
9431 int kind1 = PyUnicode_KIND(str1);
9432 int kind2 = PyUnicode_KIND(str2);
9433 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9434 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9435 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009436
9437 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009438 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009439 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009440 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009441
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009442 if (skind < kind1)
9443 /* substring too wide to be present */
9444 goto nothing;
9445
9446 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00009447 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009448 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009449 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009450 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009451 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009452 /* replace characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009453 Py_UCS4 u1, u2, maxchar;
9454 int mayshrink, rkind;
9455 u1 = PyUnicode_READ_CHAR(str1, 0);
9456 if (!findchar(sbuf, PyUnicode_KIND(self),
9457 slen, u1, 1))
Thomas Wouters477c8d52006-05-27 19:21:47 +00009458 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009459 u2 = PyUnicode_READ_CHAR(str2, 0);
9460 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9461 /* Replacing u1 with u2 may cause a maxchar reduction in the
9462 result string. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009463 if (u2 > maxchar) {
9464 maxchar = u2;
9465 mayshrink = 0;
9466 }
Victor Stinnerb9275c12011-10-05 14:01:42 +02009467 else
9468 mayshrink = maxchar > 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009469 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009470 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009471 goto error;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009472 if (PyUnicode_CopyCharacters(u, 0,
9473 (PyObject*)self, 0, slen) < 0)
9474 {
9475 Py_DECREF(u);
9476 return NULL;
9477 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009478 rkind = PyUnicode_KIND(u);
9479 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
9480 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009481 if (--maxcount < 0)
9482 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009483 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009484 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009485 if (mayshrink) {
9486 PyObject *tmp = u;
9487 u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp),
9488 PyUnicode_GET_LENGTH(tmp));
9489 Py_DECREF(tmp);
9490 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009491 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009492 int rkind = skind;
9493 char *res;
9494 if (kind1 < rkind) {
9495 /* widen substring */
9496 buf1 = _PyUnicode_AsKind(str1, rkind);
9497 if (!buf1) goto error;
9498 release1 = 1;
9499 }
9500 i = anylib_find(rkind, sbuf, slen, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009501 if (i < 0)
9502 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009503 if (rkind > kind2) {
9504 /* widen replacement */
9505 buf2 = _PyUnicode_AsKind(str2, rkind);
9506 if (!buf2) goto error;
9507 release2 = 1;
9508 }
9509 else if (rkind < kind2) {
9510 /* widen self and buf1 */
9511 rkind = kind2;
9512 if (release1) PyMem_Free(buf1);
9513 sbuf = _PyUnicode_AsKind(self, rkind);
9514 if (!sbuf) goto error;
9515 srelease = 1;
9516 buf1 = _PyUnicode_AsKind(str1, rkind);
9517 if (!buf1) goto error;
9518 release1 = 1;
9519 }
9520 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen));
9521 if (!res) {
9522 PyErr_NoMemory();
9523 goto error;
9524 }
9525 memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen));
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009526 /* change everything in-place, starting with this one */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009527 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9528 buf2,
9529 PyUnicode_KIND_SIZE(rkind, len2));
9530 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009531
9532 while ( --maxcount > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009533 i = anylib_find(rkind, sbuf+PyUnicode_KIND_SIZE(rkind, i),
9534 slen-i,
9535 buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009536 if (i == -1)
9537 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009538 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9539 buf2,
9540 PyUnicode_KIND_SIZE(rkind, len2));
9541 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009542 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009543
9544 u = PyUnicode_FromKindAndData(rkind, res, slen);
9545 PyMem_Free(res);
9546 if (!u) goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009547 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009548 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009549
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009550 Py_ssize_t n, i, j, ires;
9551 Py_ssize_t product, new_size;
9552 int rkind = skind;
9553 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009554
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009555 if (kind1 < rkind) {
9556 buf1 = _PyUnicode_AsKind(str1, rkind);
9557 if (!buf1) goto error;
9558 release1 = 1;
9559 }
9560 n = anylib_count(rkind, sbuf, slen, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009561 if (n == 0)
9562 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009563 if (kind2 < rkind) {
9564 buf2 = _PyUnicode_AsKind(str2, rkind);
9565 if (!buf2) goto error;
9566 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009567 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009568 else if (kind2 > rkind) {
9569 rkind = kind2;
9570 sbuf = _PyUnicode_AsKind(self, rkind);
9571 if (!sbuf) goto error;
9572 srelease = 1;
9573 if (release1) PyMem_Free(buf1);
9574 buf1 = _PyUnicode_AsKind(str1, rkind);
9575 if (!buf1) goto error;
9576 release1 = 1;
9577 }
9578 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
9579 PyUnicode_GET_LENGTH(str1))); */
9580 product = n * (len2-len1);
9581 if ((product / (len2-len1)) != n) {
9582 PyErr_SetString(PyExc_OverflowError,
9583 "replace string is too long");
9584 goto error;
9585 }
9586 new_size = slen + product;
9587 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
9588 PyErr_SetString(PyExc_OverflowError,
9589 "replace string is too long");
9590 goto error;
9591 }
9592 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size));
9593 if (!res)
9594 goto error;
9595 ires = i = 0;
9596 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009597 while (n-- > 0) {
9598 /* look for next match */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009599 j = anylib_find(rkind,
9600 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9601 slen-i, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009602 if (j == -1)
9603 break;
9604 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009605 /* copy unchanged part [i:j] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009606 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9607 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9608 PyUnicode_KIND_SIZE(rkind, j-i));
9609 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009610 }
9611 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009612 if (len2 > 0) {
9613 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9614 buf2,
9615 PyUnicode_KIND_SIZE(rkind, len2));
9616 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009617 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009618 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009619 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009620 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009621 /* copy tail [i:] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009622 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9623 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9624 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009625 } else {
9626 /* interleave */
9627 while (n > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009628 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9629 buf2,
9630 PyUnicode_KIND_SIZE(rkind, len2));
9631 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009632 if (--n <= 0)
9633 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009634 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9635 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9636 PyUnicode_KIND_SIZE(rkind, 1));
9637 ires++;
9638 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009639 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009640 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9641 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9642 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009643 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009644 u = PyUnicode_FromKindAndData(rkind, res, new_size);
Martin v. Löwis0b1d3482011-10-01 16:35:40 +02009645 PyMem_Free(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009646 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009647 if (srelease)
9648 PyMem_FREE(sbuf);
9649 if (release1)
9650 PyMem_FREE(buf1);
9651 if (release2)
9652 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009653 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009654 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009655
Benjamin Peterson29060642009-01-31 22:14:21 +00009656 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00009657 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009658 if (srelease)
9659 PyMem_FREE(sbuf);
9660 if (release1)
9661 PyMem_FREE(buf1);
9662 if (release2)
9663 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009664 if (PyUnicode_CheckExact(self)) {
9665 Py_INCREF(self);
9666 return (PyObject *) self;
9667 }
Victor Stinner034f6cf2011-09-30 02:26:44 +02009668 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009669 error:
9670 if (srelease && sbuf)
9671 PyMem_FREE(sbuf);
9672 if (release1 && buf1)
9673 PyMem_FREE(buf1);
9674 if (release2 && buf2)
9675 PyMem_FREE(buf2);
9676 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009677}
9678
9679/* --- Unicode Object Methods --------------------------------------------- */
9680
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009681PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009682 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009683\n\
9684Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009685characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009686
9687static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +02009688unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009689{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009690 return fixup(self, fixtitle);
9691}
9692
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009693PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009694 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009695\n\
9696Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00009697have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009698
9699static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +02009700unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009701{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009702 return fixup(self, fixcapitalize);
9703}
9704
#if 0
/* Disabled: this whole section is compiled out by the surrounding #if 0. */
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Replace every word in the list by its capitalized form */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capitalized =
            fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                  fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* PyList_SET_ITEM does not release the previous item, so drop it
           explicitly before storing the replacement. */
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string (NULL on failure) */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
9742
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009743/* Argument converter. Coerces to a single unicode character */
9744
9745static int
9746convert_uc(PyObject *obj, void *addr)
9747{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009748 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009749 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009750
Benjamin Peterson14339b62009-01-31 16:36:08 +00009751 uniobj = PyUnicode_FromObject(obj);
9752 if (uniobj == NULL) {
9753 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009754 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009755 return 0;
9756 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009757 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009758 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009759 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009760 Py_DECREF(uniobj);
9761 return 0;
9762 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009763 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009764 Py_DECREF(uniobj);
9765 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009766}
9767
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009768PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009769 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009770\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009771Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009772done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009773
9774static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009775unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009776{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009777 Py_ssize_t marg, left;
9778 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009779 Py_UCS4 fillchar = ' ';
9780
Victor Stinnere9a29352011-10-01 02:14:59 +02009781 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009782 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009783
Victor Stinnere9a29352011-10-01 02:14:59 +02009784 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009785 return NULL;
9786
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009787 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009788 Py_INCREF(self);
9789 return (PyObject*) self;
9790 }
9791
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009792 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009793 left = marg / 2 + (marg & width & 1);
9794
Victor Stinner9310abb2011-10-05 00:59:23 +02009795 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009796}
9797
Marc-André Lemburge5034372000-08-08 08:04:29 +00009798#if 0
9799
9800/* This code should go into some future Unicode collation support
9801 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00009802 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00009803
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009804/* speedy UTF-16 code point order comparison */
9805/* gleaned from: */
9806/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
9807
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009808static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009809{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009810 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00009811 0, 0, 0, 0, 0, 0, 0, 0,
9812 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009813 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009814};
9815
Guido van Rossumd57fd912000-03-10 22:53:23 +00009816static int
9817unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9818{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009819 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009820
Guido van Rossumd57fd912000-03-10 22:53:23 +00009821 Py_UNICODE *s1 = str1->str;
9822 Py_UNICODE *s2 = str2->str;
9823
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009824 len1 = str1->_base._base.length;
9825 len2 = str2->_base._base.length;
Tim Petersced69f82003-09-16 20:30:58 +00009826
Guido van Rossumd57fd912000-03-10 22:53:23 +00009827 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00009828 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009829
9830 c1 = *s1++;
9831 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00009832
Benjamin Peterson29060642009-01-31 22:14:21 +00009833 if (c1 > (1<<11) * 26)
9834 c1 += utf16Fixup[c1>>11];
9835 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009836 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009837 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00009838
9839 if (c1 != c2)
9840 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00009841
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009842 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009843 }
9844
9845 return (len1 < len2) ? -1 : (len1 != len2);
9846}
9847
Marc-André Lemburge5034372000-08-08 08:04:29 +00009848#else
9849
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009850/* This function assumes that str1 and str2 are readied by the caller. */
9851
Marc-André Lemburge5034372000-08-08 08:04:29 +00009852static int
9853unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9854{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009855 int kind1, kind2;
9856 void *data1, *data2;
9857 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009858
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009859 kind1 = PyUnicode_KIND(str1);
9860 kind2 = PyUnicode_KIND(str2);
9861 data1 = PyUnicode_DATA(str1);
9862 data2 = PyUnicode_DATA(str2);
9863 len1 = PyUnicode_GET_LENGTH(str1);
9864 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +00009865
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009866 for (i = 0; i < len1 && i < len2; ++i) {
9867 Py_UCS4 c1, c2;
9868 c1 = PyUnicode_READ(kind1, data1, i);
9869 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +00009870
9871 if (c1 != c2)
9872 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009873 }
9874
9875 return (len1 < len2) ? -1 : (len1 != len2);
9876}
9877
9878#endif
9879
Alexander Belopolsky40018472011-02-26 01:02:56 +00009880int
9881PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009882{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009883 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9884 if (PyUnicode_READY(left) == -1 ||
9885 PyUnicode_READY(right) == -1)
9886 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009887 return unicode_compare((PyUnicodeObject *)left,
9888 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009889 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009890 PyErr_Format(PyExc_TypeError,
9891 "Can't compare %.100s and %.100s",
9892 left->ob_type->tp_name,
9893 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009894 return -1;
9895}
9896
Martin v. Löwis5b222132007-06-10 09:51:05 +00009897int
9898PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
9899{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009900 Py_ssize_t i;
9901 int kind;
9902 void *data;
9903 Py_UCS4 chr;
9904
Victor Stinner910337b2011-10-03 03:20:16 +02009905 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009906 if (PyUnicode_READY(uni) == -1)
9907 return -1;
9908 kind = PyUnicode_KIND(uni);
9909 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009910 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009911 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
9912 if (chr != str[i])
9913 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00009914 /* This check keeps Python strings that end in '\0' from comparing equal
9915 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009916 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +00009917 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009918 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00009919 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009920 return 0;
9921}
9922
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009923
Benjamin Peterson29060642009-01-31 22:14:21 +00009924#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00009925 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009926
Alexander Belopolsky40018472011-02-26 01:02:56 +00009927PyObject *
9928PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009929{
9930 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009931
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009932 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9933 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009934 if (PyUnicode_READY(left) == -1 ||
9935 PyUnicode_READY(right) == -1)
9936 return NULL;
9937 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
9938 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009939 if (op == Py_EQ) {
9940 Py_INCREF(Py_False);
9941 return Py_False;
9942 }
9943 if (op == Py_NE) {
9944 Py_INCREF(Py_True);
9945 return Py_True;
9946 }
9947 }
9948 if (left == right)
9949 result = 0;
9950 else
9951 result = unicode_compare((PyUnicodeObject *)left,
9952 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009953
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009954 /* Convert the return value to a Boolean */
9955 switch (op) {
9956 case Py_EQ:
9957 v = TEST_COND(result == 0);
9958 break;
9959 case Py_NE:
9960 v = TEST_COND(result != 0);
9961 break;
9962 case Py_LE:
9963 v = TEST_COND(result <= 0);
9964 break;
9965 case Py_GE:
9966 v = TEST_COND(result >= 0);
9967 break;
9968 case Py_LT:
9969 v = TEST_COND(result == -1);
9970 break;
9971 case Py_GT:
9972 v = TEST_COND(result == 1);
9973 break;
9974 default:
9975 PyErr_BadArgument();
9976 return NULL;
9977 }
9978 Py_INCREF(v);
9979 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009980 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009981
Brian Curtindfc80e32011-08-10 20:28:54 -05009982 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009983}
9984
Alexander Belopolsky40018472011-02-26 01:02:56 +00009985int
9986PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00009987{
Thomas Wouters477c8d52006-05-27 19:21:47 +00009988 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009989 int kind1, kind2, kind;
9990 void *buf1, *buf2;
9991 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009992 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009993
9994 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00009995 sub = PyUnicode_FromObject(element);
9996 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009997 PyErr_Format(PyExc_TypeError,
9998 "'in <string>' requires string as left operand, not %s",
9999 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010000 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010001 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010002 if (PyUnicode_READY(sub) == -1)
10003 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010004
Thomas Wouters477c8d52006-05-27 19:21:47 +000010005 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +020010006 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010007 Py_DECREF(sub);
10008 return -1;
10009 }
10010
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010011 kind1 = PyUnicode_KIND(str);
10012 kind2 = PyUnicode_KIND(sub);
10013 kind = kind1 > kind2 ? kind1 : kind2;
10014 buf1 = PyUnicode_DATA(str);
10015 buf2 = PyUnicode_DATA(sub);
10016 if (kind1 != kind)
10017 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
10018 if (!buf1) {
10019 Py_DECREF(sub);
10020 return -1;
10021 }
10022 if (kind2 != kind)
10023 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
10024 if (!buf2) {
10025 Py_DECREF(sub);
10026 if (kind1 != kind) PyMem_Free(buf1);
10027 return -1;
10028 }
10029 len1 = PyUnicode_GET_LENGTH(str);
10030 len2 = PyUnicode_GET_LENGTH(sub);
10031
10032 switch(kind) {
10033 case PyUnicode_1BYTE_KIND:
10034 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10035 break;
10036 case PyUnicode_2BYTE_KIND:
10037 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10038 break;
10039 case PyUnicode_4BYTE_KIND:
10040 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10041 break;
10042 default:
10043 result = -1;
10044 assert(0);
10045 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010046
10047 Py_DECREF(str);
10048 Py_DECREF(sub);
10049
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010050 if (kind1 != kind)
10051 PyMem_Free(buf1);
10052 if (kind2 != kind)
10053 PyMem_Free(buf2);
10054
Guido van Rossum403d68b2000-03-13 15:55:09 +000010055 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010056}
10057
Guido van Rossumd57fd912000-03-10 22:53:23 +000010058/* Concat to string or Unicode object giving a new Unicode object. */
10059
Alexander Belopolsky40018472011-02-26 01:02:56 +000010060PyObject *
10061PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010062{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010063 PyObject *u = NULL, *v = NULL, *w;
10064 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010065
10066 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010067 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010068 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010069 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010070 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010071 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010072 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010073
10074 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010075 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010076 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010077 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010078 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010079 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010080 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010081 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010082 }
10083
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010084 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinnerff9e50f2011-09-28 22:17:19 +020010085 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010086
Guido van Rossumd57fd912000-03-10 22:53:23 +000010087 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010088 w = PyUnicode_New(
10089 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
10090 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010091 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010092 goto onError;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010093 if (PyUnicode_CopyCharacters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)) < 0)
10094 goto onError;
Victor Stinner157f83f2011-09-28 21:41:31 +020010095 if (PyUnicode_CopyCharacters(w, PyUnicode_GET_LENGTH(u),
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010096 v, 0,
10097 PyUnicode_GET_LENGTH(v)) < 0)
10098 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010099 Py_DECREF(u);
10100 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010101 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010102 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010103
Benjamin Peterson29060642009-01-31 22:14:21 +000010104 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010105 Py_XDECREF(u);
10106 Py_XDECREF(v);
10107 return NULL;
10108}
10109
Victor Stinnerb0923652011-10-04 01:17:31 +020010110static void
10111unicode_append_inplace(PyObject **p_left, PyObject *right)
10112{
10113 Py_ssize_t left_len, right_len, new_len;
10114#ifdef Py_DEBUG
10115 Py_ssize_t copied;
10116#endif
10117
10118 assert(PyUnicode_IS_READY(*p_left));
10119 assert(PyUnicode_IS_READY(right));
10120
10121 left_len = PyUnicode_GET_LENGTH(*p_left);
10122 right_len = PyUnicode_GET_LENGTH(right);
10123 if (left_len > PY_SSIZE_T_MAX - right_len) {
10124 PyErr_SetString(PyExc_OverflowError,
10125 "strings are too large to concat");
10126 goto error;
10127 }
10128 new_len = left_len + right_len;
10129
10130 /* Now we own the last reference to 'left', so we can resize it
10131 * in-place.
10132 */
10133 if (unicode_resize(p_left, new_len) != 0) {
10134 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10135 * deallocated so it cannot be put back into
10136 * 'variable'. The MemoryError is raised when there
10137 * is no value in 'variable', which might (very
10138 * remotely) be a cause of incompatibilities.
10139 */
10140 goto error;
10141 }
10142 /* copy 'right' into the newly allocated area of 'left' */
10143#ifdef Py_DEBUG
10144 copied = PyUnicode_CopyCharacters(*p_left, left_len,
10145 right, 0,
10146 right_len);
10147 assert(0 <= copied);
10148#else
10149 PyUnicode_CopyCharacters(*p_left, left_len, right, 0, right_len);
10150#endif
10151 return;
10152
10153error:
10154 Py_DECREF(*p_left);
10155 *p_left = NULL;
10156}
10157
Walter Dörwald1ab83302007-05-18 17:15:44 +000010158void
Victor Stinner23e56682011-10-03 03:54:37 +020010159PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010160{
Victor Stinner23e56682011-10-03 03:54:37 +020010161 PyObject *left, *res;
10162
10163 if (p_left == NULL) {
10164 if (!PyErr_Occurred())
10165 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010166 return;
10167 }
Victor Stinner23e56682011-10-03 03:54:37 +020010168 left = *p_left;
10169 if (right == NULL || !PyUnicode_Check(left)) {
10170 if (!PyErr_Occurred())
10171 PyErr_BadInternalCall();
10172 goto error;
10173 }
10174
Victor Stinnere1335c72011-10-04 20:53:03 +020010175 if (PyUnicode_READY(left))
10176 goto error;
10177 if (PyUnicode_READY(right))
10178 goto error;
10179
Victor Stinner23e56682011-10-03 03:54:37 +020010180 if (PyUnicode_CheckExact(left) && left != unicode_empty
10181 && PyUnicode_CheckExact(right) && right != unicode_empty
10182 && unicode_resizable(left)
10183 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
10184 || _PyUnicode_WSTR(left) != NULL))
10185 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010186 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10187 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010188 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010189 not so different than duplicating the string. */
10190 if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
Victor Stinner23e56682011-10-03 03:54:37 +020010191 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010192 unicode_append_inplace(p_left, right);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010193 if (p_left != NULL)
10194 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010195 return;
10196 }
10197 }
10198
10199 res = PyUnicode_Concat(left, right);
10200 if (res == NULL)
10201 goto error;
10202 Py_DECREF(left);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010203 assert(_PyUnicode_CheckConsistency(res, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010204 *p_left = res;
10205 return;
10206
10207error:
10208 Py_DECREF(*p_left);
10209 *p_left = NULL;
Walter Dörwald1ab83302007-05-18 17:15:44 +000010210}
10211
10212void
10213PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10214{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010215 PyUnicode_Append(pleft, right);
10216 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010217}
10218
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010219PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010220 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010221\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010222Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010223string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010224interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010225
10226static PyObject *
10227unicode_count(PyUnicodeObject *self, PyObject *args)
10228{
10229 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010230 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010231 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010232 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010233 int kind1, kind2, kind;
10234 void *buf1, *buf2;
10235 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010236
Jesus Ceaac451502011-04-20 17:09:23 +020010237 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10238 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010239 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010240
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010241 kind1 = PyUnicode_KIND(self);
10242 kind2 = PyUnicode_KIND(substring);
10243 kind = kind1 > kind2 ? kind1 : kind2;
10244 buf1 = PyUnicode_DATA(self);
10245 buf2 = PyUnicode_DATA(substring);
10246 if (kind1 != kind)
10247 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
10248 if (!buf1) {
10249 Py_DECREF(substring);
10250 return NULL;
10251 }
10252 if (kind2 != kind)
10253 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
10254 if (!buf2) {
10255 Py_DECREF(substring);
10256 if (kind1 != kind) PyMem_Free(buf1);
10257 return NULL;
10258 }
10259 len1 = PyUnicode_GET_LENGTH(self);
10260 len2 = PyUnicode_GET_LENGTH(substring);
10261
10262 ADJUST_INDICES(start, end, len1);
10263 switch(kind) {
10264 case PyUnicode_1BYTE_KIND:
10265 iresult = ucs1lib_count(
10266 ((Py_UCS1*)buf1) + start, end - start,
10267 buf2, len2, PY_SSIZE_T_MAX
10268 );
10269 break;
10270 case PyUnicode_2BYTE_KIND:
10271 iresult = ucs2lib_count(
10272 ((Py_UCS2*)buf1) + start, end - start,
10273 buf2, len2, PY_SSIZE_T_MAX
10274 );
10275 break;
10276 case PyUnicode_4BYTE_KIND:
10277 iresult = ucs4lib_count(
10278 ((Py_UCS4*)buf1) + start, end - start,
10279 buf2, len2, PY_SSIZE_T_MAX
10280 );
10281 break;
10282 default:
10283 assert(0); iresult = 0;
10284 }
10285
10286 result = PyLong_FromSsize_t(iresult);
10287
10288 if (kind1 != kind)
10289 PyMem_Free(buf1);
10290 if (kind2 != kind)
10291 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010292
10293 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010294
Guido van Rossumd57fd912000-03-10 22:53:23 +000010295 return result;
10296}
10297
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010298PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010299 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010300\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010301Encode S using the codec registered for encoding. Default encoding\n\
10302is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010303handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010304a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10305'xmlcharrefreplace' as well as any other name registered with\n\
10306codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010307
10308static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +000010309unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010310{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010311 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010312 char *encoding = NULL;
10313 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010314
Benjamin Peterson308d6372009-09-18 21:42:35 +000010315 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10316 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010317 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +000010318 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010319}
10320
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010321PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010322 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010323\n\
10324Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010325If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010326
10327static PyObject*
10328unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
10329{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010330 Py_ssize_t i, j, line_pos, src_len, incr;
10331 Py_UCS4 ch;
10332 PyObject *u;
10333 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010334 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010335 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010336 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010337
10338 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010339 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010340
Antoine Pitrou22425222011-10-04 19:10:51 +020010341 if (PyUnicode_READY(self) == -1)
10342 return NULL;
10343
Thomas Wouters7e474022000-07-16 12:04:32 +000010344 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010345 src_len = PyUnicode_GET_LENGTH(self);
10346 i = j = line_pos = 0;
10347 kind = PyUnicode_KIND(self);
10348 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010349 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010350 for (; i < src_len; i++) {
10351 ch = PyUnicode_READ(kind, src_data, i);
10352 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010353 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010354 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010355 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010356 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010357 goto overflow;
10358 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010359 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010360 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010361 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010362 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010363 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010364 goto overflow;
10365 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010366 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010367 if (ch == '\n' || ch == '\r')
10368 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010369 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010370 }
Antoine Pitroue19aa382011-10-04 16:04:01 +020010371 if (!found && PyUnicode_CheckExact(self)) {
10372 Py_INCREF((PyObject *) self);
10373 return (PyObject *) self;
10374 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010375
Guido van Rossumd57fd912000-03-10 22:53:23 +000010376 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010377 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010378 if (!u)
10379 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010380 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010381
Antoine Pitroue71d5742011-10-04 15:55:09 +020010382 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010383
Antoine Pitroue71d5742011-10-04 15:55:09 +020010384 for (; i < src_len; i++) {
10385 ch = PyUnicode_READ(kind, src_data, i);
10386 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010387 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010388 incr = tabsize - (line_pos % tabsize);
10389 line_pos += incr;
10390 while (incr--) {
10391 PyUnicode_WRITE(kind, dest_data, j, ' ');
10392 j++;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010393 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010394 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010395 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010396 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010397 line_pos++;
10398 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010399 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010400 if (ch == '\n' || ch == '\r')
10401 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010402 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010403 }
10404 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinner17efeed2011-10-04 20:05:46 +020010405#ifndef DONT_MAKE_RESULT_READY
10406 if (_PyUnicode_READY_REPLACE(&u)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010407 Py_DECREF(u);
10408 return NULL;
10409 }
Victor Stinner17efeed2011-10-04 20:05:46 +020010410#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010411 assert(_PyUnicode_CheckConsistency(u, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010412 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010413
Antoine Pitroue71d5742011-10-04 15:55:09 +020010414 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010415 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10416 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010417}
10418
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010419PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010420 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010421\n\
10422Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010423such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010424arguments start and end are interpreted as in slice notation.\n\
10425\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010426Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010427
10428static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010429unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010430{
Jesus Ceaac451502011-04-20 17:09:23 +020010431 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010432 Py_ssize_t start;
10433 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010434 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010435
Jesus Ceaac451502011-04-20 17:09:23 +020010436 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10437 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010438 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010439
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010440 if (PyUnicode_READY(self) == -1)
10441 return NULL;
10442 if (PyUnicode_READY(substring) == -1)
10443 return NULL;
10444
10445 result = any_find_slice(
10446 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
10447 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010448 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010449
10450 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010451
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010452 if (result == -2)
10453 return NULL;
10454
Christian Heimes217cfd12007-12-02 14:31:20 +000010455 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010456}
10457
10458static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010459unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010460{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010461 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
10462 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010463 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010464 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010465}
10466
Guido van Rossumc2504932007-09-18 19:42:40 +000010467/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010010468 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000010469static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +000010470unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010471{
Guido van Rossumc2504932007-09-18 19:42:40 +000010472 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010010473 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010474
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010475 if (_PyUnicode_HASH(self) != -1)
10476 return _PyUnicode_HASH(self);
10477 if (PyUnicode_READY(self) == -1)
10478 return -1;
10479 len = PyUnicode_GET_LENGTH(self);
10480
10481 /* The hash function as a macro, gets expanded three times below. */
10482#define HASH(P) \
10483 x = (Py_uhash_t)*P << 7; \
10484 while (--len >= 0) \
10485 x = (1000003*x) ^ (Py_uhash_t)*P++;
10486
10487 switch (PyUnicode_KIND(self)) {
10488 case PyUnicode_1BYTE_KIND: {
10489 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
10490 HASH(c);
10491 break;
10492 }
10493 case PyUnicode_2BYTE_KIND: {
10494 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
10495 HASH(s);
10496 break;
10497 }
10498 default: {
10499 Py_UCS4 *l;
10500 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
10501 "Impossible switch case in unicode_hash");
10502 l = PyUnicode_4BYTE_DATA(self);
10503 HASH(l);
10504 break;
10505 }
10506 }
10507 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
10508
Guido van Rossumc2504932007-09-18 19:42:40 +000010509 if (x == -1)
10510 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010511 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010512 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010513}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010514#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000010515
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010516PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010517 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010518\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010519Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010520
10521static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010522unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010523{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010524 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +020010525 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010526 Py_ssize_t start;
10527 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010528
Jesus Ceaac451502011-04-20 17:09:23 +020010529 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
10530 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010531 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010532
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010533 if (PyUnicode_READY(self) == -1)
10534 return NULL;
10535 if (PyUnicode_READY(substring) == -1)
10536 return NULL;
10537
10538 result = any_find_slice(
10539 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
10540 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010541 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010542
10543 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010544
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010545 if (result == -2)
10546 return NULL;
10547
Guido van Rossumd57fd912000-03-10 22:53:23 +000010548 if (result < 0) {
10549 PyErr_SetString(PyExc_ValueError, "substring not found");
10550 return NULL;
10551 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010552
Christian Heimes217cfd12007-12-02 14:31:20 +000010553 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010554}
10555
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010556PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010557 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010558\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010559Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010560at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010561
10562static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010563unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010564{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010565 Py_ssize_t i, length;
10566 int kind;
10567 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010568 int cased;
10569
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010570 if (PyUnicode_READY(self) == -1)
10571 return NULL;
10572 length = PyUnicode_GET_LENGTH(self);
10573 kind = PyUnicode_KIND(self);
10574 data = PyUnicode_DATA(self);
10575
Guido van Rossumd57fd912000-03-10 22:53:23 +000010576 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010577 if (length == 1)
10578 return PyBool_FromLong(
10579 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010580
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010581 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010582 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010583 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010584
Guido van Rossumd57fd912000-03-10 22:53:23 +000010585 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010586 for (i = 0; i < length; i++) {
10587 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010588
Benjamin Peterson29060642009-01-31 22:14:21 +000010589 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
10590 return PyBool_FromLong(0);
10591 else if (!cased && Py_UNICODE_ISLOWER(ch))
10592 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010593 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010594 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010595}
10596
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010597PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010598 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010599\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010600Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010601at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010602
10603static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010604unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010605{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010606 Py_ssize_t i, length;
10607 int kind;
10608 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010609 int cased;
10610
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010611 if (PyUnicode_READY(self) == -1)
10612 return NULL;
10613 length = PyUnicode_GET_LENGTH(self);
10614 kind = PyUnicode_KIND(self);
10615 data = PyUnicode_DATA(self);
10616
Guido van Rossumd57fd912000-03-10 22:53:23 +000010617 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010618 if (length == 1)
10619 return PyBool_FromLong(
10620 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010621
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010622 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010623 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010624 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010625
Guido van Rossumd57fd912000-03-10 22:53:23 +000010626 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010627 for (i = 0; i < length; i++) {
10628 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010629
Benjamin Peterson29060642009-01-31 22:14:21 +000010630 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
10631 return PyBool_FromLong(0);
10632 else if (!cased && Py_UNICODE_ISUPPER(ch))
10633 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010634 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010635 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010636}
10637
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010638PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010639 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010640\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010641Return True if S is a titlecased string and there is at least one\n\
10642character in S, i.e. upper- and titlecase characters may only\n\
10643follow uncased characters and lowercase characters only cased ones.\n\
10644Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010645
10646static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010647unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010648{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010649 Py_ssize_t i, length;
10650 int kind;
10651 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010652 int cased, previous_is_cased;
10653
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010654 if (PyUnicode_READY(self) == -1)
10655 return NULL;
10656 length = PyUnicode_GET_LENGTH(self);
10657 kind = PyUnicode_KIND(self);
10658 data = PyUnicode_DATA(self);
10659
Guido van Rossumd57fd912000-03-10 22:53:23 +000010660 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010661 if (length == 1) {
10662 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10663 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
10664 (Py_UNICODE_ISUPPER(ch) != 0));
10665 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010666
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010667 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010668 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010669 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010670
Guido van Rossumd57fd912000-03-10 22:53:23 +000010671 cased = 0;
10672 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010673 for (i = 0; i < length; i++) {
10674 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010675
Benjamin Peterson29060642009-01-31 22:14:21 +000010676 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
10677 if (previous_is_cased)
10678 return PyBool_FromLong(0);
10679 previous_is_cased = 1;
10680 cased = 1;
10681 }
10682 else if (Py_UNICODE_ISLOWER(ch)) {
10683 if (!previous_is_cased)
10684 return PyBool_FromLong(0);
10685 previous_is_cased = 1;
10686 cased = 1;
10687 }
10688 else
10689 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010690 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010691 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010692}
10693
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010694PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010695 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010696\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010697Return True if all characters in S are whitespace\n\
10698and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010699
10700static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010701unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010702{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010703 Py_ssize_t i, length;
10704 int kind;
10705 void *data;
10706
10707 if (PyUnicode_READY(self) == -1)
10708 return NULL;
10709 length = PyUnicode_GET_LENGTH(self);
10710 kind = PyUnicode_KIND(self);
10711 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010712
Guido van Rossumd57fd912000-03-10 22:53:23 +000010713 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010714 if (length == 1)
10715 return PyBool_FromLong(
10716 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010717
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010718 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010719 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010720 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010721
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010722 for (i = 0; i < length; i++) {
10723 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010724 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010725 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010726 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010727 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010728}
10729
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010730PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010731 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010732\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010733Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010734and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010735
10736static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010737unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010738{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010739 Py_ssize_t i, length;
10740 int kind;
10741 void *data;
10742
10743 if (PyUnicode_READY(self) == -1)
10744 return NULL;
10745 length = PyUnicode_GET_LENGTH(self);
10746 kind = PyUnicode_KIND(self);
10747 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010748
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010749 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010750 if (length == 1)
10751 return PyBool_FromLong(
10752 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010753
10754 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010755 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010756 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010757
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010758 for (i = 0; i < length; i++) {
10759 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010760 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010761 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010762 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010763}
10764
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010765PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010766 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010767\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010768Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010769and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010770
10771static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010772unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010773{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010774 int kind;
10775 void *data;
10776 Py_ssize_t len, i;
10777
10778 if (PyUnicode_READY(self) == -1)
10779 return NULL;
10780
10781 kind = PyUnicode_KIND(self);
10782 data = PyUnicode_DATA(self);
10783 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010784
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010785 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010786 if (len == 1) {
10787 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10788 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10789 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010790
10791 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010792 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010793 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010794
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010795 for (i = 0; i < len; i++) {
10796 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010797 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010798 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010799 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010800 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010801}
10802
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010803PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010804 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010805\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010806Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010807False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010808
10809static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010810unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010811{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010812 Py_ssize_t i, length;
10813 int kind;
10814 void *data;
10815
10816 if (PyUnicode_READY(self) == -1)
10817 return NULL;
10818 length = PyUnicode_GET_LENGTH(self);
10819 kind = PyUnicode_KIND(self);
10820 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010821
Guido van Rossumd57fd912000-03-10 22:53:23 +000010822 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010823 if (length == 1)
10824 return PyBool_FromLong(
10825 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010826
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010827 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010828 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010829 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010830
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010831 for (i = 0; i < length; i++) {
10832 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010833 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010834 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010835 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010836}
10837
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010838PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010839 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010840\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010841Return True if all characters in S are digits\n\
10842and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010843
10844static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010845unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010846{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010847 Py_ssize_t i, length;
10848 int kind;
10849 void *data;
10850
10851 if (PyUnicode_READY(self) == -1)
10852 return NULL;
10853 length = PyUnicode_GET_LENGTH(self);
10854 kind = PyUnicode_KIND(self);
10855 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010856
Guido van Rossumd57fd912000-03-10 22:53:23 +000010857 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010858 if (length == 1) {
10859 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10860 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
10861 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010862
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010863 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010864 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010865 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010866
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010867 for (i = 0; i < length; i++) {
10868 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010869 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010870 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010871 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010872}
10873
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010874PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010875 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010876\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010877Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010878False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010879
10880static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010881unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010882{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010883 Py_ssize_t i, length;
10884 int kind;
10885 void *data;
10886
10887 if (PyUnicode_READY(self) == -1)
10888 return NULL;
10889 length = PyUnicode_GET_LENGTH(self);
10890 kind = PyUnicode_KIND(self);
10891 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010892
Guido van Rossumd57fd912000-03-10 22:53:23 +000010893 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010894 if (length == 1)
10895 return PyBool_FromLong(
10896 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010897
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010898 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010899 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010900 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010901
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010902 for (i = 0; i < length; i++) {
10903 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010904 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010905 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010906 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010907}
10908
Martin v. Löwis47383402007-08-15 07:32:56 +000010909int
10910PyUnicode_IsIdentifier(PyObject *self)
10911{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010912 int kind;
10913 void *data;
10914 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010915 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000010916
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010917 if (PyUnicode_READY(self) == -1) {
10918 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000010919 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010920 }
10921
10922 /* Special case for empty strings */
10923 if (PyUnicode_GET_LENGTH(self) == 0)
10924 return 0;
10925 kind = PyUnicode_KIND(self);
10926 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000010927
10928 /* PEP 3131 says that the first character must be in
10929 XID_Start and subsequent characters in XID_Continue,
10930 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000010931 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000010932 letters, digits, underscore). However, given the current
10933 definition of XID_Start and XID_Continue, it is sufficient
10934 to check just for these, except that _ must be allowed
10935 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010936 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050010937 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000010938 return 0;
10939
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040010940 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010941 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010942 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000010943 return 1;
10944}
10945
10946PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010947 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000010948\n\
10949Return True if S is a valid identifier according\n\
10950to the language definition.");
10951
10952static PyObject*
10953unicode_isidentifier(PyObject *self)
10954{
10955 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
10956}
10957
Georg Brandl559e5d72008-06-11 18:37:52 +000010958PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010959 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000010960\n\
10961Return True if all characters in S are considered\n\
10962printable in repr() or S is empty, False otherwise.");
10963
10964static PyObject*
10965unicode_isprintable(PyObject *self)
10966{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010967 Py_ssize_t i, length;
10968 int kind;
10969 void *data;
10970
10971 if (PyUnicode_READY(self) == -1)
10972 return NULL;
10973 length = PyUnicode_GET_LENGTH(self);
10974 kind = PyUnicode_KIND(self);
10975 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000010976
10977 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010978 if (length == 1)
10979 return PyBool_FromLong(
10980 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000010981
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010982 for (i = 0; i < length; i++) {
10983 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010984 Py_RETURN_FALSE;
10985 }
10986 }
10987 Py_RETURN_TRUE;
10988}
10989
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010990PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000010991 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010992\n\
10993Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000010994iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010995
10996static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010997unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010998{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010999 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011000}
11001
Martin v. Löwis18e16552006-02-15 17:27:45 +000011002static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000011003unicode_length(PyUnicodeObject *self)
11004{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011005 if (PyUnicode_READY(self) == -1)
11006 return -1;
11007 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011008}
11009
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011010PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011011 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011012\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011013Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011014done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011015
11016static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011017unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011018{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011019 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011020 Py_UCS4 fillchar = ' ';
11021
11022 if (PyUnicode_READY(self) == -1)
11023 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011024
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011025 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011026 return NULL;
11027
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011028 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011029 Py_INCREF(self);
11030 return (PyObject*) self;
11031 }
11032
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011033 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011034}
11035
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011036PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011037 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011038\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011039Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011040
11041static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011042unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011043{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011044 return fixup(self, fixlower);
11045}
11046
/* Selector values telling the strip helpers which end(s) to trim. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by the selectors above. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for selector i: the format string without its
   three-character "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i]+3)
11055
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011056/* externally visible for str.strip(unicode) */
11057PyObject *
11058_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
11059{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011060 void *data;
11061 int kind;
11062 Py_ssize_t i, j, len;
11063 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011064
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011065 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11066 return NULL;
11067
11068 kind = PyUnicode_KIND(self);
11069 data = PyUnicode_DATA(self);
11070 len = PyUnicode_GET_LENGTH(self);
11071 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11072 PyUnicode_DATA(sepobj),
11073 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011074
Benjamin Peterson14339b62009-01-31 16:36:08 +000011075 i = 0;
11076 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011077 while (i < len &&
11078 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011079 i++;
11080 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011081 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011082
Benjamin Peterson14339b62009-01-31 16:36:08 +000011083 j = len;
11084 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011085 do {
11086 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011087 } while (j >= i &&
11088 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011089 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011090 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011091
Victor Stinner12bab6d2011-10-01 01:53:49 +020011092 return PyUnicode_Substring((PyObject*)self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011093}
11094
11095PyObject*
11096PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11097{
11098 unsigned char *data;
11099 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011100 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011101
Victor Stinnerde636f32011-10-01 03:55:54 +020011102 if (PyUnicode_READY(self) == -1)
11103 return NULL;
11104
11105 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11106
Victor Stinner12bab6d2011-10-01 01:53:49 +020011107 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011108 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011109 if (PyUnicode_CheckExact(self)) {
11110 Py_INCREF(self);
11111 return self;
11112 }
11113 else
11114 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011115 }
11116
Victor Stinner12bab6d2011-10-01 01:53:49 +020011117 length = end - start;
11118 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011119 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011120
Victor Stinnerde636f32011-10-01 03:55:54 +020011121 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011122 PyErr_SetString(PyExc_IndexError, "string index out of range");
11123 return NULL;
11124 }
11125
Victor Stinnerb9275c12011-10-05 14:01:42 +020011126 if (PyUnicode_IS_ASCII(self)) {
11127 kind = PyUnicode_KIND(self);
11128 data = PyUnicode_1BYTE_DATA(self);
11129 return unicode_fromascii(data + start, length);
11130 }
11131 else {
11132 kind = PyUnicode_KIND(self);
11133 data = PyUnicode_1BYTE_DATA(self);
11134 return PyUnicode_FromKindAndData(kind,
11135 data + PyUnicode_KIND_SIZE(kind, start),
11136 length);
11137 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011138}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011139
11140static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011141do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011142{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011143 int kind;
11144 void *data;
11145 Py_ssize_t len, i, j;
11146
11147 if (PyUnicode_READY(self) == -1)
11148 return NULL;
11149
11150 kind = PyUnicode_KIND(self);
11151 data = PyUnicode_DATA(self);
11152 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011153
Benjamin Peterson14339b62009-01-31 16:36:08 +000011154 i = 0;
11155 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011156 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011157 i++;
11158 }
11159 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011160
Benjamin Peterson14339b62009-01-31 16:36:08 +000011161 j = len;
11162 if (striptype != LEFTSTRIP) {
11163 do {
11164 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011165 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011166 j++;
11167 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011168
Victor Stinner12bab6d2011-10-01 01:53:49 +020011169 return PyUnicode_Substring((PyObject*)self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011170}
11171
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011172
11173static PyObject *
11174do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
11175{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011176 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011177
Benjamin Peterson14339b62009-01-31 16:36:08 +000011178 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11179 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011180
Benjamin Peterson14339b62009-01-31 16:36:08 +000011181 if (sep != NULL && sep != Py_None) {
11182 if (PyUnicode_Check(sep))
11183 return _PyUnicode_XStrip(self, striptype, sep);
11184 else {
11185 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011186 "%s arg must be None or str",
11187 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011188 return NULL;
11189 }
11190 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011191
Benjamin Peterson14339b62009-01-31 16:36:08 +000011192 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011193}
11194
11195
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011196PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011197 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011198\n\
11199Return a copy of the string S with leading and trailing\n\
11200whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011201If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011202
11203static PyObject *
11204unicode_strip(PyUnicodeObject *self, PyObject *args)
11205{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011206 if (PyTuple_GET_SIZE(args) == 0)
11207 return do_strip(self, BOTHSTRIP); /* Common case */
11208 else
11209 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011210}
11211
11212
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011213PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011214 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011215\n\
11216Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011217If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011218
11219static PyObject *
11220unicode_lstrip(PyUnicodeObject *self, PyObject *args)
11221{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011222 if (PyTuple_GET_SIZE(args) == 0)
11223 return do_strip(self, LEFTSTRIP); /* Common case */
11224 else
11225 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011226}
11227
11228
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011229PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011230 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011231\n\
11232Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011233If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011234
11235static PyObject *
11236unicode_rstrip(PyUnicodeObject *self, PyObject *args)
11237{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011238 if (PyTuple_GET_SIZE(args) == 0)
11239 return do_strip(self, RIGHTSTRIP); /* Common case */
11240 else
11241 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011242}
11243
11244
Guido van Rossumd57fd912000-03-10 22:53:23 +000011245static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000011246unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011247{
11248 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011249 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011250
Georg Brandl222de0f2009-04-12 12:01:50 +000011251 if (len < 1) {
11252 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011253 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011254 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011255
Tim Peters7a29bd52001-09-12 03:03:31 +000011256 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011257 /* no repeat, return original string */
11258 Py_INCREF(str);
11259 return (PyObject*) str;
11260 }
Tim Peters8f422462000-09-09 06:13:41 +000011261
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011262 if (PyUnicode_READY(str) == -1)
11263 return NULL;
11264
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011265 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011266 PyErr_SetString(PyExc_OverflowError,
11267 "repeated string is too long");
11268 return NULL;
11269 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011270 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011271
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011272 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011273 if (!u)
11274 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011275 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011276
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011277 if (PyUnicode_GET_LENGTH(str) == 1) {
11278 const int kind = PyUnicode_KIND(str);
11279 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11280 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011281 if (kind == PyUnicode_1BYTE_KIND)
11282 memset(to, (unsigned char)fill_char, len);
11283 else {
11284 for (n = 0; n < len; ++n)
11285 PyUnicode_WRITE(kind, to, n, fill_char);
11286 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011287 }
11288 else {
11289 /* number of characters copied this far */
11290 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
11291 const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str);
11292 char *to = (char *) PyUnicode_DATA(u);
11293 Py_MEMCPY(to, PyUnicode_DATA(str),
11294 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011295 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011296 n = (done <= nchars-done) ? done : nchars-done;
11297 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011298 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011299 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011300 }
11301
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011302 assert(_PyUnicode_CheckConsistency(u, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011303 return (PyObject*) u;
11304}
11305
Alexander Belopolsky40018472011-02-26 01:02:56 +000011306PyObject *
11307PyUnicode_Replace(PyObject *obj,
11308 PyObject *subobj,
11309 PyObject *replobj,
11310 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011311{
11312 PyObject *self;
11313 PyObject *str1;
11314 PyObject *str2;
11315 PyObject *result;
11316
11317 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011318 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011319 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011320 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011321 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011322 Py_DECREF(self);
11323 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011324 }
11325 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011326 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011327 Py_DECREF(self);
11328 Py_DECREF(str1);
11329 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011330 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011331 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011332 Py_DECREF(self);
11333 Py_DECREF(str1);
11334 Py_DECREF(str2);
11335 return result;
11336}
11337
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011338PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011339 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011340\n\
11341Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011342old replaced by new. If the optional argument count is\n\
11343given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011344
11345static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011346unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011347{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011348 PyObject *str1;
11349 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011350 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011351 PyObject *result;
11352
Martin v. Löwis18e16552006-02-15 17:27:45 +000011353 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011354 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011355 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011356 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011357 str1 = PyUnicode_FromObject(str1);
11358 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11359 return NULL;
11360 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011361 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011362 Py_DECREF(str1);
11363 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011364 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011365
11366 result = replace(self, str1, str2, maxcount);
11367
11368 Py_DECREF(str1);
11369 Py_DECREF(str2);
11370 return result;
11371}
11372
Alexander Belopolsky40018472011-02-26 01:02:56 +000011373static PyObject *
11374unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011375{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011376 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011377 Py_ssize_t isize;
11378 Py_ssize_t osize, squote, dquote, i, o;
11379 Py_UCS4 max, quote;
11380 int ikind, okind;
11381 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011382
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011383 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011384 return NULL;
11385
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011386 isize = PyUnicode_GET_LENGTH(unicode);
11387 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011388
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011389 /* Compute length of output, quote characters, and
11390 maximum character */
11391 osize = 2; /* quotes */
11392 max = 127;
11393 squote = dquote = 0;
11394 ikind = PyUnicode_KIND(unicode);
11395 for (i = 0; i < isize; i++) {
11396 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11397 switch (ch) {
11398 case '\'': squote++; osize++; break;
11399 case '"': dquote++; osize++; break;
11400 case '\\': case '\t': case '\r': case '\n':
11401 osize += 2; break;
11402 default:
11403 /* Fast-path ASCII */
11404 if (ch < ' ' || ch == 0x7f)
11405 osize += 4; /* \xHH */
11406 else if (ch < 0x7f)
11407 osize++;
11408 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11409 osize++;
11410 max = ch > max ? ch : max;
11411 }
11412 else if (ch < 0x100)
11413 osize += 4; /* \xHH */
11414 else if (ch < 0x10000)
11415 osize += 6; /* \uHHHH */
11416 else
11417 osize += 10; /* \uHHHHHHHH */
11418 }
11419 }
11420
11421 quote = '\'';
11422 if (squote) {
11423 if (dquote)
11424 /* Both squote and dquote present. Use squote,
11425 and escape them */
11426 osize += squote;
11427 else
11428 quote = '"';
11429 }
11430
11431 repr = PyUnicode_New(osize, max);
11432 if (repr == NULL)
11433 return NULL;
11434 okind = PyUnicode_KIND(repr);
11435 odata = PyUnicode_DATA(repr);
11436
11437 PyUnicode_WRITE(okind, odata, 0, quote);
11438 PyUnicode_WRITE(okind, odata, osize-1, quote);
11439
11440 for (i = 0, o = 1; i < isize; i++) {
11441 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011442
11443 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011444 if ((ch == quote) || (ch == '\\')) {
11445 PyUnicode_WRITE(okind, odata, o++, '\\');
11446 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011447 continue;
11448 }
11449
Benjamin Peterson29060642009-01-31 22:14:21 +000011450 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011451 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011452 PyUnicode_WRITE(okind, odata, o++, '\\');
11453 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011454 }
11455 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011456 PyUnicode_WRITE(okind, odata, o++, '\\');
11457 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011458 }
11459 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011460 PyUnicode_WRITE(okind, odata, o++, '\\');
11461 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011462 }
11463
11464 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011465 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011466 PyUnicode_WRITE(okind, odata, o++, '\\');
11467 PyUnicode_WRITE(okind, odata, o++, 'x');
11468 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11469 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011470 }
11471
Georg Brandl559e5d72008-06-11 18:37:52 +000011472 /* Copy ASCII characters as-is */
11473 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011474 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011475 }
11476
Benjamin Peterson29060642009-01-31 22:14:21 +000011477 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000011478 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011479 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000011480 (categories Z* and C* except ASCII space)
11481 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011482 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011483 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011484 if (ch <= 0xff) {
11485 PyUnicode_WRITE(okind, odata, o++, '\\');
11486 PyUnicode_WRITE(okind, odata, o++, 'x');
11487 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11488 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011489 }
11490 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011491 else if (ch >= 0x10000) {
11492 PyUnicode_WRITE(okind, odata, o++, '\\');
11493 PyUnicode_WRITE(okind, odata, o++, 'U');
11494 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
11495 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
11496 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
11497 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
11498 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11499 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11500 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11501 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011502 }
11503 /* Map 16-bit characters to '\uxxxx' */
11504 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011505 PyUnicode_WRITE(okind, odata, o++, '\\');
11506 PyUnicode_WRITE(okind, odata, o++, 'u');
11507 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11508 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11509 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11510 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011511 }
11512 }
11513 /* Copy characters as-is */
11514 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011515 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011516 }
11517 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000011518 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011519 /* Closing quote already added at the beginning */
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011520 assert(_PyUnicode_CheckConsistency(unicode, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000011521 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011522}
11523
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011524PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011525 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011526\n\
11527Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011528such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011529arguments start and end are interpreted as in slice notation.\n\
11530\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011531Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011532
11533static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011534unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011535{
Jesus Ceaac451502011-04-20 17:09:23 +020011536 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011537 Py_ssize_t start;
11538 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011539 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011540
Jesus Ceaac451502011-04-20 17:09:23 +020011541 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
11542 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011543 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011544
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011545 if (PyUnicode_READY(self) == -1)
11546 return NULL;
11547 if (PyUnicode_READY(substring) == -1)
11548 return NULL;
11549
11550 result = any_find_slice(
11551 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11552 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011553 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011554
11555 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011556
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011557 if (result == -2)
11558 return NULL;
11559
Christian Heimes217cfd12007-12-02 14:31:20 +000011560 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011561}
11562
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011563PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011564 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011565\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011566Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011567
11568static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011569unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011570{
Jesus Ceaac451502011-04-20 17:09:23 +020011571 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011572 Py_ssize_t start;
11573 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011574 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011575
Jesus Ceaac451502011-04-20 17:09:23 +020011576 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
11577 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011578 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011579
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011580 if (PyUnicode_READY(self) == -1)
11581 return NULL;
11582 if (PyUnicode_READY(substring) == -1)
11583 return NULL;
11584
11585 result = any_find_slice(
11586 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11587 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011588 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011589
11590 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011591
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011592 if (result == -2)
11593 return NULL;
11594
Guido van Rossumd57fd912000-03-10 22:53:23 +000011595 if (result < 0) {
11596 PyErr_SetString(PyExc_ValueError, "substring not found");
11597 return NULL;
11598 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011599
Christian Heimes217cfd12007-12-02 14:31:20 +000011600 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011601}
11602
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011603PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011604 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011605\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011606Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011607done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011608
11609static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011610unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011611{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011612 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011613 Py_UCS4 fillchar = ' ';
11614
Victor Stinnere9a29352011-10-01 02:14:59 +020011615 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011616 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011617
Victor Stinnere9a29352011-10-01 02:14:59 +020011618 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011619 return NULL;
11620
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011621 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011622 Py_INCREF(self);
11623 return (PyObject*) self;
11624 }
11625
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011626 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011627}
11628
Alexander Belopolsky40018472011-02-26 01:02:56 +000011629PyObject *
11630PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011631{
11632 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000011633
Guido van Rossumd57fd912000-03-10 22:53:23 +000011634 s = PyUnicode_FromObject(s);
11635 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011636 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011637 if (sep != NULL) {
11638 sep = PyUnicode_FromObject(sep);
11639 if (sep == NULL) {
11640 Py_DECREF(s);
11641 return NULL;
11642 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011643 }
11644
Victor Stinner9310abb2011-10-05 00:59:23 +020011645 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011646
11647 Py_DECREF(s);
11648 Py_XDECREF(sep);
11649 return result;
11650}
11651
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011652PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011653 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011654\n\
11655Return a list of the words in S, using sep as the\n\
11656delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000011657splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000011658whitespace string is a separator and empty strings are\n\
11659removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011660
11661static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011662unicode_split(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011663{
11664 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011665 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011666
Martin v. Löwis18e16552006-02-15 17:27:45 +000011667 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011668 return NULL;
11669
11670 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011671 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011672 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020011673 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011674 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011675 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011676}
11677
Thomas Wouters477c8d52006-05-27 19:21:47 +000011678PyObject *
11679PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11680{
11681 PyObject* str_obj;
11682 PyObject* sep_obj;
11683 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011684 int kind1, kind2, kind;
11685 void *buf1 = NULL, *buf2 = NULL;
11686 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011687
11688 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020011689 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011690 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011691 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011692 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011693 Py_DECREF(str_obj);
11694 return NULL;
11695 }
11696
Victor Stinner14f8f022011-10-05 20:58:25 +020011697 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011698 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020011699 kind = Py_MAX(kind1, kind2);
11700 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011701 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020011702 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011703 if (!buf1)
11704 goto onError;
11705 buf2 = PyUnicode_DATA(sep_obj);
11706 if (kind2 != kind)
11707 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11708 if (!buf2)
11709 goto onError;
11710 len1 = PyUnicode_GET_LENGTH(str_obj);
11711 len2 = PyUnicode_GET_LENGTH(sep_obj);
11712
Victor Stinner14f8f022011-10-05 20:58:25 +020011713 switch(PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011714 case PyUnicode_1BYTE_KIND:
11715 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11716 break;
11717 case PyUnicode_2BYTE_KIND:
11718 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11719 break;
11720 case PyUnicode_4BYTE_KIND:
11721 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11722 break;
11723 default:
11724 assert(0);
11725 out = 0;
11726 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011727
11728 Py_DECREF(sep_obj);
11729 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011730 if (kind1 != kind)
11731 PyMem_Free(buf1);
11732 if (kind2 != kind)
11733 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011734
11735 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011736 onError:
11737 Py_DECREF(sep_obj);
11738 Py_DECREF(str_obj);
11739 if (kind1 != kind && buf1)
11740 PyMem_Free(buf1);
11741 if (kind2 != kind && buf2)
11742 PyMem_Free(buf2);
11743 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011744}
11745
11746
11747PyObject *
11748PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11749{
11750 PyObject* str_obj;
11751 PyObject* sep_obj;
11752 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011753 int kind1, kind2, kind;
11754 void *buf1 = NULL, *buf2 = NULL;
11755 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011756
11757 str_obj = PyUnicode_FromObject(str_in);
11758 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000011759 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011760 sep_obj = PyUnicode_FromObject(sep_in);
11761 if (!sep_obj) {
11762 Py_DECREF(str_obj);
11763 return NULL;
11764 }
11765
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011766 kind1 = PyUnicode_KIND(str_in);
11767 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020011768 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011769 buf1 = PyUnicode_DATA(str_in);
11770 if (kind1 != kind)
11771 buf1 = _PyUnicode_AsKind(str_in, kind);
11772 if (!buf1)
11773 goto onError;
11774 buf2 = PyUnicode_DATA(sep_obj);
11775 if (kind2 != kind)
11776 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11777 if (!buf2)
11778 goto onError;
11779 len1 = PyUnicode_GET_LENGTH(str_obj);
11780 len2 = PyUnicode_GET_LENGTH(sep_obj);
11781
11782 switch(PyUnicode_KIND(str_in)) {
11783 case PyUnicode_1BYTE_KIND:
11784 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11785 break;
11786 case PyUnicode_2BYTE_KIND:
11787 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11788 break;
11789 case PyUnicode_4BYTE_KIND:
11790 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11791 break;
11792 default:
11793 assert(0);
11794 out = 0;
11795 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011796
11797 Py_DECREF(sep_obj);
11798 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011799 if (kind1 != kind)
11800 PyMem_Free(buf1);
11801 if (kind2 != kind)
11802 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011803
11804 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011805 onError:
11806 Py_DECREF(sep_obj);
11807 Py_DECREF(str_obj);
11808 if (kind1 != kind && buf1)
11809 PyMem_Free(buf1);
11810 if (kind2 != kind && buf2)
11811 PyMem_Free(buf2);
11812 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011813}
11814
11815PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011816 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011817\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011818Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011819the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011820found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011821
11822static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011823unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011824{
Victor Stinner9310abb2011-10-05 00:59:23 +020011825 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011826}
11827
11828PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000011829 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011830\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011831Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011832the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011833separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011834
11835static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011836unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011837{
Victor Stinner9310abb2011-10-05 00:59:23 +020011838 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011839}
11840
Alexander Belopolsky40018472011-02-26 01:02:56 +000011841PyObject *
11842PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011843{
11844 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011845
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011846 s = PyUnicode_FromObject(s);
11847 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011848 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011849 if (sep != NULL) {
11850 sep = PyUnicode_FromObject(sep);
11851 if (sep == NULL) {
11852 Py_DECREF(s);
11853 return NULL;
11854 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011855 }
11856
Victor Stinner9310abb2011-10-05 00:59:23 +020011857 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011858
11859 Py_DECREF(s);
11860 Py_XDECREF(sep);
11861 return result;
11862}
11863
11864PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011865 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011866\n\
11867Return a list of the words in S, using sep as the\n\
11868delimiter string, starting at the end of the string and\n\
11869working to the front. If maxsplit is given, at most maxsplit\n\
11870splits are done. If sep is not specified, any whitespace string\n\
11871is a separator.");
11872
11873static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011874unicode_rsplit(PyObject *self, PyObject *args)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011875{
11876 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011877 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011878
Martin v. Löwis18e16552006-02-15 17:27:45 +000011879 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011880 return NULL;
11881
11882 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011883 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011884 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020011885 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011886 else
Victor Stinner9310abb2011-10-05 00:59:23 +020011887 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011888}
11889
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011890PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011891 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011892\n\
11893Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000011894Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011895is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011896
11897static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011898unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011899{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011900 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000011901 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011902
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011903 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
11904 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011905 return NULL;
11906
Guido van Rossum86662912000-04-11 15:38:46 +000011907 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011908}
11909
11910static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000011911PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011912{
Walter Dörwald346737f2007-05-31 10:44:43 +000011913 if (PyUnicode_CheckExact(self)) {
11914 Py_INCREF(self);
11915 return self;
11916 } else
11917 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020011918 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011919}
11920
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011921PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011922 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011923\n\
11924Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011925and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011926
11927static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011928unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011929{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011930 return fixup(self, fixswapcase);
11931}
11932
Georg Brandlceee0772007-11-27 23:48:05 +000011933PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011934 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011935\n\
11936Return a translation table usable for str.translate().\n\
11937If there is only one argument, it must be a dictionary mapping Unicode\n\
11938ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011939Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011940If there are two arguments, they must be strings of equal length, and\n\
11941in the resulting dictionary, each character in x will be mapped to the\n\
11942character at the same position in y. If there is a third argument, it\n\
11943must be a string, whose characters will be mapped to None in the result.");
11944
11945static PyObject*
11946unicode_maketrans(PyUnicodeObject *null, PyObject *args)
11947{
11948 PyObject *x, *y = NULL, *z = NULL;
11949 PyObject *new = NULL, *key, *value;
11950 Py_ssize_t i = 0;
11951 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011952
Georg Brandlceee0772007-11-27 23:48:05 +000011953 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
11954 return NULL;
11955 new = PyDict_New();
11956 if (!new)
11957 return NULL;
11958 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011959 int x_kind, y_kind, z_kind;
11960 void *x_data, *y_data, *z_data;
11961
Georg Brandlceee0772007-11-27 23:48:05 +000011962 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000011963 if (!PyUnicode_Check(x)) {
11964 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
11965 "be a string if there is a second argument");
11966 goto err;
11967 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011968 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011969 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
11970 "arguments must have equal length");
11971 goto err;
11972 }
11973 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011974 x_kind = PyUnicode_KIND(x);
11975 y_kind = PyUnicode_KIND(y);
11976 x_data = PyUnicode_DATA(x);
11977 y_data = PyUnicode_DATA(y);
11978 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
11979 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
11980 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011981 if (!key || !value)
11982 goto err;
11983 res = PyDict_SetItem(new, key, value);
11984 Py_DECREF(key);
11985 Py_DECREF(value);
11986 if (res < 0)
11987 goto err;
11988 }
11989 /* create entries for deleting chars in z */
11990 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011991 z_kind = PyUnicode_KIND(z);
11992 z_data = PyUnicode_DATA(z);
Georg Brandlceee0772007-11-27 23:48:05 +000011993 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011994 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011995 if (!key)
11996 goto err;
11997 res = PyDict_SetItem(new, key, Py_None);
11998 Py_DECREF(key);
11999 if (res < 0)
12000 goto err;
12001 }
12002 }
12003 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012004 int kind;
12005 void *data;
12006
Georg Brandlceee0772007-11-27 23:48:05 +000012007 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012008 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012009 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12010 "to maketrans it must be a dict");
12011 goto err;
12012 }
12013 /* copy entries into the new dict, converting string keys to int keys */
12014 while (PyDict_Next(x, &i, &key, &value)) {
12015 if (PyUnicode_Check(key)) {
12016 /* convert string keys to integer keys */
12017 PyObject *newkey;
12018 if (PyUnicode_GET_SIZE(key) != 1) {
12019 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12020 "table must be of length 1");
12021 goto err;
12022 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012023 kind = PyUnicode_KIND(key);
12024 data = PyUnicode_DATA(key);
12025 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012026 if (!newkey)
12027 goto err;
12028 res = PyDict_SetItem(new, newkey, value);
12029 Py_DECREF(newkey);
12030 if (res < 0)
12031 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012032 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012033 /* just keep integer keys */
12034 if (PyDict_SetItem(new, key, value) < 0)
12035 goto err;
12036 } else {
12037 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12038 "be strings or integers");
12039 goto err;
12040 }
12041 }
12042 }
12043 return new;
12044 err:
12045 Py_DECREF(new);
12046 return NULL;
12047}
12048
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012049PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012050 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012051\n\
12052Return a copy of the string S, where all characters have been mapped\n\
12053through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012054Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012055Unmapped characters are left untouched. Characters mapped to None\n\
12056are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012057
12058static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012059unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012060{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012061 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012062}
12063
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012064PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012065 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012066\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012067Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012068
12069static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012070unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012071{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012072 return fixup(self, fixupper);
12073}
12074
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012075PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012076 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012077\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012078Pad a numeric string S with zeros on the left, to fill a field\n\
12079of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012080
12081static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012082unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012083{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012084 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012085 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012086 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012087 int kind;
12088 void *data;
12089 Py_UCS4 chr;
12090
12091 if (PyUnicode_READY(self) == -1)
12092 return NULL;
12093
Martin v. Löwis18e16552006-02-15 17:27:45 +000012094 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012095 return NULL;
12096
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012097 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000012098 if (PyUnicode_CheckExact(self)) {
12099 Py_INCREF(self);
12100 return (PyObject*) self;
12101 }
12102 else
Victor Stinner2219e0a2011-10-01 01:16:59 +020012103 return PyUnicode_Copy((PyObject*)self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012104 }
12105
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012106 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012107
12108 u = pad(self, fill, 0, '0');
12109
Walter Dörwald068325e2002-04-15 13:36:47 +000012110 if (u == NULL)
12111 return NULL;
12112
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012113 kind = PyUnicode_KIND(u);
12114 data = PyUnicode_DATA(u);
12115 chr = PyUnicode_READ(kind, data, fill);
12116
12117 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012118 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012119 PyUnicode_WRITE(kind, data, 0, chr);
12120 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012121 }
12122
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012123 assert(_PyUnicode_CheckConsistency(u, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012124 return (PyObject*) u;
12125}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012126
#if 0
/* Compiled out: method wrapper that forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII() and returns its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
12134
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012135PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012136 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012137\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012138Return True if S starts with the specified prefix, False otherwise.\n\
12139With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012140With optional end, stop comparing S at that position.\n\
12141prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012142
12143static PyObject *
12144unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012145 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012146{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012147 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012148 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012149 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012150 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012151 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012152
Jesus Ceaac451502011-04-20 17:09:23 +020012153 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012154 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012155 if (PyTuple_Check(subobj)) {
12156 Py_ssize_t i;
12157 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12158 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012159 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012160 if (substring == NULL)
12161 return NULL;
12162 result = tailmatch(self, substring, start, end, -1);
12163 Py_DECREF(substring);
12164 if (result) {
12165 Py_RETURN_TRUE;
12166 }
12167 }
12168 /* nothing matched */
12169 Py_RETURN_FALSE;
12170 }
12171 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012172 if (substring == NULL) {
12173 if (PyErr_ExceptionMatches(PyExc_TypeError))
12174 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12175 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012176 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012177 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012178 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012179 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012180 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012181}
12182
12183
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012184PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012185 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012186\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012187Return True if S ends with the specified suffix, False otherwise.\n\
12188With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012189With optional end, stop comparing S at that position.\n\
12190suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012191
12192static PyObject *
12193unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012194 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012195{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012196 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012197 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012198 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012199 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012200 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012201
Jesus Ceaac451502011-04-20 17:09:23 +020012202 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012203 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012204 if (PyTuple_Check(subobj)) {
12205 Py_ssize_t i;
12206 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12207 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012208 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012209 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012210 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012211 result = tailmatch(self, substring, start, end, +1);
12212 Py_DECREF(substring);
12213 if (result) {
12214 Py_RETURN_TRUE;
12215 }
12216 }
12217 Py_RETURN_FALSE;
12218 }
12219 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012220 if (substring == NULL) {
12221 if (PyErr_ExceptionMatches(PyExc_TypeError))
12222 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12223 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012224 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012225 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012226 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012227 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012228 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012229}
12230
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012231#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012232
12233PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012234 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012235\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012236Return a formatted version of S, using substitutions from args and kwargs.\n\
12237The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012238
Eric Smith27bbca62010-11-04 17:06:58 +000012239PyDoc_STRVAR(format_map__doc__,
12240 "S.format_map(mapping) -> str\n\
12241\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012242Return a formatted version of S, using substitutions from mapping.\n\
12243The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012244
Eric Smith4a7d76d2008-05-30 18:10:19 +000012245static PyObject *
12246unicode__format__(PyObject* self, PyObject* args)
12247{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012248 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012249
12250 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12251 return NULL;
12252
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012253 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012254 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012255 if (out != NULL)
12256 assert(_PyUnicode_CheckConsistency(out, 1));
12257 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012258}
12259
Eric Smith8c663262007-08-25 02:26:07 +000012260PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012261 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012262\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012263Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012264
12265static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012266unicode__sizeof__(PyUnicodeObject *v)
12267{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012268 Py_ssize_t size;
12269
12270 /* If it's a compact object, account for base structure +
12271 character data. */
12272 if (PyUnicode_IS_COMPACT_ASCII(v))
12273 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12274 else if (PyUnicode_IS_COMPACT(v))
12275 size = sizeof(PyCompactUnicodeObject) +
12276 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v);
12277 else {
12278 /* If it is a two-block object, account for base object, and
12279 for character block if present. */
12280 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012281 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012282 size += (PyUnicode_GET_LENGTH(v) + 1) *
12283 PyUnicode_CHARACTER_SIZE(v);
12284 }
12285 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012286 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012287 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012288 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012289 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012290 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012291
12292 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012293}
12294
12295PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012296 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012297
12298static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012299unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012300{
Victor Stinner034f6cf2011-09-30 02:26:44 +020012301 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012302 if (!copy)
12303 return NULL;
12304 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012305}
12306
Guido van Rossumd57fd912000-03-10 22:53:23 +000012307static PyMethodDef unicode_methods[] = {
12308
12309 /* Order is according to common usage: often used methods should
12310 appear first, since lookup is done sequentially. */
12311
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012312 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012313 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12314 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012315 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012316 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12317 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12318 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12319 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12320 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12321 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12322 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012323 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012324 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12325 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12326 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012327 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012328 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12329 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12330 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012331 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012332 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012333 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012334 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012335 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12336 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12337 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12338 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12339 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12340 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12341 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12342 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12343 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12344 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12345 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12346 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12347 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12348 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012349 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012350 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012351 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012352 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012353 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012354 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012355 {"maketrans", (PyCFunction) unicode_maketrans,
12356 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012357 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012358#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012359 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012360#endif
12361
12362#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012363 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012364 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012365#endif
12366
Benjamin Peterson14339b62009-01-31 16:36:08 +000012367 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012368 {NULL, NULL}
12369};
12370
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012371static PyObject *
12372unicode_mod(PyObject *v, PyObject *w)
12373{
Brian Curtindfc80e32011-08-10 20:28:54 -050012374 if (!PyUnicode_Check(v))
12375 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012376 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012377}
12378
12379static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012380 0, /*nb_add*/
12381 0, /*nb_subtract*/
12382 0, /*nb_multiply*/
12383 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012384};
12385
Guido van Rossumd57fd912000-03-10 22:53:23 +000012386static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012387 (lenfunc) unicode_length, /* sq_length */
12388 PyUnicode_Concat, /* sq_concat */
12389 (ssizeargfunc) unicode_repeat, /* sq_repeat */
12390 (ssizeargfunc) unicode_getitem, /* sq_item */
12391 0, /* sq_slice */
12392 0, /* sq_ass_item */
12393 0, /* sq_ass_slice */
12394 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012395};
12396
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012397static PyObject*
12398unicode_subscript(PyUnicodeObject* self, PyObject* item)
12399{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012400 if (PyUnicode_READY(self) == -1)
12401 return NULL;
12402
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000012403 if (PyIndex_Check(item)) {
12404 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012405 if (i == -1 && PyErr_Occurred())
12406 return NULL;
12407 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012408 i += PyUnicode_GET_LENGTH(self);
Victor Stinner2fe5ced2011-10-02 00:25:40 +020012409 return unicode_getitem((PyObject*)self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012410 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000012411 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012412 PyObject *result;
12413 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012414 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012415 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012416
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012417 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000012418 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012419 return NULL;
12420 }
12421
12422 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012423 return PyUnicode_New(0, 0);
12424 } else if (start == 0 && step == 1 &&
12425 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000012426 PyUnicode_CheckExact(self)) {
12427 Py_INCREF(self);
12428 return (PyObject *)self;
12429 } else if (step == 1) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012430 return PyUnicode_Substring((PyObject*)self,
12431 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012432 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012433 /* General case */
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012434 max_char = 0;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012435 src_kind = PyUnicode_KIND(self);
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012436 kind_limit = kind_maxchar_limit(src_kind);
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012437 src_data = PyUnicode_DATA(self);
12438 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
12439 ch = PyUnicode_READ(src_kind, src_data, cur);
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012440 if (ch > max_char) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012441 max_char = ch;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012442 if (max_char >= kind_limit)
12443 break;
12444 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012445 }
12446 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012447 if (result == NULL)
12448 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012449 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012450 dest_data = PyUnicode_DATA(result);
12451
12452 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012453 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
12454 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012455 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012456 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012457 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012458 } else {
12459 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
12460 return NULL;
12461 }
12462}
12463
12464static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012465 (lenfunc)unicode_length, /* mp_length */
12466 (binaryfunc)unicode_subscript, /* mp_subscript */
12467 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012468};
12469
Guido van Rossumd57fd912000-03-10 22:53:23 +000012470
Guido van Rossumd57fd912000-03-10 22:53:23 +000012471/* Helpers for PyUnicode_Format() */
12472
12473static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000012474getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012475{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012476 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012477 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012478 (*p_argidx)++;
12479 if (arglen < 0)
12480 return args;
12481 else
12482 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012483 }
12484 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012485 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012486 return NULL;
12487}
12488
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012489/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012490
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012491static PyObject *
12492formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012493{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012494 char *p;
12495 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012496 double x;
Tim Petersced69f82003-09-16 20:30:58 +000012497
Guido van Rossumd57fd912000-03-10 22:53:23 +000012498 x = PyFloat_AsDouble(v);
12499 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012500 return NULL;
12501
Guido van Rossumd57fd912000-03-10 22:53:23 +000012502 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012503 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000012504
Eric Smith0923d1d2009-04-16 20:16:10 +000012505 p = PyOS_double_to_string(x, type, prec,
12506 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012507 if (p == NULL)
12508 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012509 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000012510 PyMem_Free(p);
12511 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012512}
12513
Tim Peters38fd5b62000-09-21 05:43:11 +000012514static PyObject*
12515formatlong(PyObject *val, int flags, int prec, int type)
12516{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012517 char *buf;
12518 int len;
12519 PyObject *str; /* temporary string object. */
12520 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012521
Benjamin Peterson14339b62009-01-31 16:36:08 +000012522 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
12523 if (!str)
12524 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012525 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012526 Py_DECREF(str);
12527 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012528}
12529
Guido van Rossumd57fd912000-03-10 22:53:23 +000012530static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012531formatchar(Py_UCS4 *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012532 size_t buflen,
12533 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012534{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012535 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012536 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012537 if (PyUnicode_GET_LENGTH(v) == 1) {
12538 buf[0] = PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000012539 buf[1] = '\0';
12540 return 1;
12541 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012542 goto onError;
12543 }
12544 else {
12545 /* Integer input truncated to a character */
12546 long x;
12547 x = PyLong_AsLong(v);
12548 if (x == -1 && PyErr_Occurred())
12549 goto onError;
12550
12551 if (x < 0 || x > 0x10ffff) {
12552 PyErr_SetString(PyExc_OverflowError,
12553 "%c arg not in range(0x110000)");
12554 return -1;
12555 }
12556
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012557 buf[0] = (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012558 buf[1] = '\0';
12559 return 1;
12560 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012561
Benjamin Peterson29060642009-01-31 22:14:21 +000012562 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012563 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012564 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012565 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012566}
12567
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
   FORMATBUFLEN is the length of the buffer in which chars are formatted.
   The expansion is fully parenthesized so that it remains a single operand
   wherever the macro is used (e.g. as the operand of sizeof).
*/
#define FORMATBUFLEN ((size_t)10)
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012572
Alexander Belopolsky40018472011-02-26 01:02:56 +000012573PyObject *
12574PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012575{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012576 void *fmt;
12577 int fmtkind;
12578 PyObject *result;
12579 Py_UCS4 *res, *res0;
12580 Py_UCS4 max;
12581 int kind;
12582 Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012583 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012584 PyObject *dict = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012585 PyUnicodeObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +000012586
Guido van Rossumd57fd912000-03-10 22:53:23 +000012587 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012588 PyErr_BadInternalCall();
12589 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012590 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012591 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
12592 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012593 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012594 fmt = PyUnicode_DATA(uformat);
12595 fmtkind = PyUnicode_KIND(uformat);
12596 fmtcnt = PyUnicode_GET_LENGTH(uformat);
12597 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012598
12599 reslen = rescnt = fmtcnt + 100;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012600 res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4));
12601 if (res0 == NULL) {
12602 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012603 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012604 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012605
12606 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012607 arglen = PyTuple_Size(args);
12608 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012609 }
12610 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012611 arglen = -1;
12612 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012613 }
Christian Heimes90aa7642007-12-19 02:45:37 +000012614 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000012615 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000012616 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012617
12618 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012619 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012620 if (--rescnt < 0) {
12621 rescnt = fmtcnt + 100;
12622 reslen += rescnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012623 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12624 if (res0 == NULL){
12625 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012626 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012627 }
12628 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012629 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012630 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012631 *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012632 }
12633 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012634 /* Got a format specifier */
12635 int flags = 0;
12636 Py_ssize_t width = -1;
12637 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012638 Py_UCS4 c = '\0';
12639 Py_UCS4 fill;
Benjamin Peterson29060642009-01-31 22:14:21 +000012640 int isnumok;
12641 PyObject *v = NULL;
12642 PyObject *temp = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012643 void *pbuf;
12644 Py_ssize_t pindex;
Benjamin Peterson29060642009-01-31 22:14:21 +000012645 Py_UNICODE sign;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012646 Py_ssize_t len, len1;
12647 Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012648
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012649 fmtpos++;
12650 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
12651 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000012652 Py_ssize_t keylen;
12653 PyObject *key;
12654 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000012655
Benjamin Peterson29060642009-01-31 22:14:21 +000012656 if (dict == NULL) {
12657 PyErr_SetString(PyExc_TypeError,
12658 "format requires a mapping");
12659 goto onError;
12660 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012661 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012662 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012663 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012664 /* Skip over balanced parentheses */
12665 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012666 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000012667 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012668 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000012669 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012670 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000012671 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012672 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012673 if (fmtcnt < 0 || pcount > 0) {
12674 PyErr_SetString(PyExc_ValueError,
12675 "incomplete format key");
12676 goto onError;
12677 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020012678 key = PyUnicode_Substring((PyObject*)uformat,
12679 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000012680 if (key == NULL)
12681 goto onError;
12682 if (args_owned) {
12683 Py_DECREF(args);
12684 args_owned = 0;
12685 }
12686 args = PyObject_GetItem(dict, key);
12687 Py_DECREF(key);
12688 if (args == NULL) {
12689 goto onError;
12690 }
12691 args_owned = 1;
12692 arglen = -1;
12693 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012694 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012695 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012696 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012697 case '-': flags |= F_LJUST; continue;
12698 case '+': flags |= F_SIGN; continue;
12699 case ' ': flags |= F_BLANK; continue;
12700 case '#': flags |= F_ALT; continue;
12701 case '0': flags |= F_ZERO; continue;
12702 }
12703 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012704 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012705 if (c == '*') {
12706 v = getnextarg(args, arglen, &argidx);
12707 if (v == NULL)
12708 goto onError;
12709 if (!PyLong_Check(v)) {
12710 PyErr_SetString(PyExc_TypeError,
12711 "* wants int");
12712 goto onError;
12713 }
12714 width = PyLong_AsLong(v);
12715 if (width == -1 && PyErr_Occurred())
12716 goto onError;
12717 if (width < 0) {
12718 flags |= F_LJUST;
12719 width = -width;
12720 }
12721 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012722 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012723 }
12724 else if (c >= '0' && c <= '9') {
12725 width = c - '0';
12726 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012727 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012728 if (c < '0' || c > '9')
12729 break;
12730 if ((width*10) / 10 != width) {
12731 PyErr_SetString(PyExc_ValueError,
12732 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000012733 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000012734 }
12735 width = width*10 + (c - '0');
12736 }
12737 }
12738 if (c == '.') {
12739 prec = 0;
12740 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012741 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012742 if (c == '*') {
12743 v = getnextarg(args, arglen, &argidx);
12744 if (v == NULL)
12745 goto onError;
12746 if (!PyLong_Check(v)) {
12747 PyErr_SetString(PyExc_TypeError,
12748 "* wants int");
12749 goto onError;
12750 }
12751 prec = PyLong_AsLong(v);
12752 if (prec == -1 && PyErr_Occurred())
12753 goto onError;
12754 if (prec < 0)
12755 prec = 0;
12756 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012757 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012758 }
12759 else if (c >= '0' && c <= '9') {
12760 prec = c - '0';
12761 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012762 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012763 if (c < '0' || c > '9')
12764 break;
12765 if ((prec*10) / 10 != prec) {
12766 PyErr_SetString(PyExc_ValueError,
12767 "prec too big");
12768 goto onError;
12769 }
12770 prec = prec*10 + (c - '0');
12771 }
12772 }
12773 } /* prec */
12774 if (fmtcnt >= 0) {
12775 if (c == 'h' || c == 'l' || c == 'L') {
12776 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012777 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012778 }
12779 }
12780 if (fmtcnt < 0) {
12781 PyErr_SetString(PyExc_ValueError,
12782 "incomplete format");
12783 goto onError;
12784 }
12785 if (c != '%') {
12786 v = getnextarg(args, arglen, &argidx);
12787 if (v == NULL)
12788 goto onError;
12789 }
12790 sign = 0;
12791 fill = ' ';
12792 switch (c) {
12793
12794 case '%':
12795 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012796 kind = PyUnicode_4BYTE_KIND;
Benjamin Peterson29060642009-01-31 22:14:21 +000012797 /* presume that buffer length is at least 1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012798 PyUnicode_WRITE(kind, pbuf, 0, '%');
Benjamin Peterson29060642009-01-31 22:14:21 +000012799 len = 1;
12800 break;
12801
12802 case 's':
12803 case 'r':
12804 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000012805 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012806 temp = v;
12807 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012808 }
12809 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012810 if (c == 's')
12811 temp = PyObject_Str(v);
12812 else if (c == 'r')
12813 temp = PyObject_Repr(v);
12814 else
12815 temp = PyObject_ASCII(v);
12816 if (temp == NULL)
12817 goto onError;
12818 if (PyUnicode_Check(temp))
12819 /* nothing to do */;
12820 else {
12821 Py_DECREF(temp);
12822 PyErr_SetString(PyExc_TypeError,
12823 "%s argument has non-string str()");
12824 goto onError;
12825 }
12826 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012827 if (PyUnicode_READY(temp) == -1) {
12828 Py_CLEAR(temp);
12829 goto onError;
12830 }
12831 pbuf = PyUnicode_DATA(temp);
12832 kind = PyUnicode_KIND(temp);
12833 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012834 if (prec >= 0 && len > prec)
12835 len = prec;
12836 break;
12837
12838 case 'i':
12839 case 'd':
12840 case 'u':
12841 case 'o':
12842 case 'x':
12843 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000012844 isnumok = 0;
12845 if (PyNumber_Check(v)) {
12846 PyObject *iobj=NULL;
12847
12848 if (PyLong_Check(v)) {
12849 iobj = v;
12850 Py_INCREF(iobj);
12851 }
12852 else {
12853 iobj = PyNumber_Long(v);
12854 }
12855 if (iobj!=NULL) {
12856 if (PyLong_Check(iobj)) {
12857 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070012858 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000012859 Py_DECREF(iobj);
12860 if (!temp)
12861 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012862 if (PyUnicode_READY(temp) == -1) {
12863 Py_CLEAR(temp);
12864 goto onError;
12865 }
12866 pbuf = PyUnicode_DATA(temp);
12867 kind = PyUnicode_KIND(temp);
12868 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012869 sign = 1;
12870 }
12871 else {
12872 Py_DECREF(iobj);
12873 }
12874 }
12875 }
12876 if (!isnumok) {
12877 PyErr_Format(PyExc_TypeError,
12878 "%%%c format: a number is required, "
12879 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
12880 goto onError;
12881 }
12882 if (flags & F_ZERO)
12883 fill = '0';
12884 break;
12885
12886 case 'e':
12887 case 'E':
12888 case 'f':
12889 case 'F':
12890 case 'g':
12891 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012892 temp = formatfloat(v, flags, prec, c);
12893 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000012894 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012895 if (PyUnicode_READY(temp) == -1) {
12896 Py_CLEAR(temp);
12897 goto onError;
12898 }
12899 pbuf = PyUnicode_DATA(temp);
12900 kind = PyUnicode_KIND(temp);
12901 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012902 sign = 1;
12903 if (flags & F_ZERO)
12904 fill = '0';
12905 break;
12906
12907 case 'c':
12908 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012909 kind = PyUnicode_4BYTE_KIND;
Victor Stinnerb9dcffb2011-09-29 00:39:24 +020012910 len = formatchar(pbuf, Py_ARRAY_LENGTH(formatbuf), v);
Benjamin Peterson29060642009-01-31 22:14:21 +000012911 if (len < 0)
12912 goto onError;
12913 break;
12914
12915 default:
12916 PyErr_Format(PyExc_ValueError,
12917 "unsupported format character '%c' (0x%x) "
12918 "at index %zd",
12919 (31<=c && c<=126) ? (char)c : '?',
12920 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012921 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000012922 goto onError;
12923 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012924 /* pbuf is initialized here. */
12925 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000012926 if (sign) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012927 if (PyUnicode_READ(kind, pbuf, pindex) == '-' ||
12928 PyUnicode_READ(kind, pbuf, pindex) == '+') {
12929 sign = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012930 len--;
12931 }
12932 else if (flags & F_SIGN)
12933 sign = '+';
12934 else if (flags & F_BLANK)
12935 sign = ' ';
12936 else
12937 sign = 0;
12938 }
12939 if (width < len)
12940 width = len;
12941 if (rescnt - (sign != 0) < width) {
12942 reslen -= rescnt;
12943 rescnt = width + fmtcnt + 100;
12944 reslen += rescnt;
12945 if (reslen < 0) {
12946 Py_XDECREF(temp);
12947 PyErr_NoMemory();
12948 goto onError;
12949 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012950 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12951 if (res0 == 0) {
12952 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012953 Py_XDECREF(temp);
12954 goto onError;
12955 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012956 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012957 }
12958 if (sign) {
12959 if (fill != ' ')
12960 *res++ = sign;
12961 rescnt--;
12962 if (width > len)
12963 width--;
12964 }
12965 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012966 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12967 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000012968 if (fill != ' ') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012969 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12970 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012971 }
12972 rescnt -= 2;
12973 width -= 2;
12974 if (width < 0)
12975 width = 0;
12976 len -= 2;
12977 }
12978 if (width > len && !(flags & F_LJUST)) {
12979 do {
12980 --rescnt;
12981 *res++ = fill;
12982 } while (--width > len);
12983 }
12984 if (fill == ' ') {
12985 if (sign)
12986 *res++ = sign;
12987 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012988 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12989 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
12990 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12991 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012992 }
12993 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012994 /* Copy all characters, preserving len */
12995 len1 = len;
12996 while (len1--) {
12997 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12998 rescnt--;
12999 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013000 while (--width >= len) {
13001 --rescnt;
13002 *res++ = ' ';
13003 }
13004 if (dict && (argidx < arglen) && c != '%') {
13005 PyErr_SetString(PyExc_TypeError,
13006 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +000013007 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013008 goto onError;
13009 }
13010 Py_XDECREF(temp);
13011 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013012 } /* until end */
13013 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013014 PyErr_SetString(PyExc_TypeError,
13015 "not all arguments converted during string formatting");
13016 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013017 }
13018
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013019
13020 for (max=0, res = res0; res < res0+reslen-rescnt; res++)
13021 if (*res > max)
13022 max = *res;
13023 result = PyUnicode_New(reslen - rescnt, max);
13024 if (!result)
Benjamin Peterson29060642009-01-31 22:14:21 +000013025 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013026 kind = PyUnicode_KIND(result);
13027 for (res = res0; res < res0+reslen-rescnt; res++)
13028 PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res);
13029 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013030 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013031 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013032 }
13033 Py_DECREF(uformat);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013034 assert(_PyUnicode_CheckConsistency(result, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000013035 return (PyObject *)result;
13036
Benjamin Peterson29060642009-01-31 22:14:21 +000013037 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013038 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013039 Py_DECREF(uformat);
13040 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013041 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013042 }
13043 return NULL;
13044}
13045
Jeremy Hylton938ace62002-07-17 16:30:39 +000013046static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013047unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13048
Tim Peters6d6c1a32001-08-02 04:15:00 +000013049static PyObject *
13050unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13051{
Benjamin Peterson29060642009-01-31 22:14:21 +000013052 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013053 static char *kwlist[] = {"object", "encoding", "errors", 0};
13054 char *encoding = NULL;
13055 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000013056
Benjamin Peterson14339b62009-01-31 16:36:08 +000013057 if (type != &PyUnicode_Type)
13058 return unicode_subtype_new(type, args, kwds);
13059 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000013060 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013061 return NULL;
13062 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013063 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013064 if (encoding == NULL && errors == NULL)
13065 return PyObject_Str(x);
13066 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013067 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013068}
13069
Guido van Rossume023fe02001-08-30 03:12:59 +000013070static PyObject *
13071unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13072{
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013073 PyUnicodeObject *unicode, *self;
13074 Py_ssize_t length, char_size;
13075 int share_wstr, share_utf8;
13076 unsigned int kind;
13077 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000013078
Benjamin Peterson14339b62009-01-31 16:36:08 +000013079 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013080
13081 unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
13082 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013083 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013084 assert(_PyUnicode_CHECK(unicode));
Victor Stinnere06e1452011-10-04 20:52:31 +020013085 if (PyUnicode_READY(unicode))
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013086 return NULL;
13087
13088 self = (PyUnicodeObject *) type->tp_alloc(type, 0);
13089 if (self == NULL) {
13090 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013091 return NULL;
13092 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013093 kind = PyUnicode_KIND(unicode);
13094 length = PyUnicode_GET_LENGTH(unicode);
13095
13096 _PyUnicode_LENGTH(self) = length;
13097 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13098 _PyUnicode_STATE(self).interned = 0;
13099 _PyUnicode_STATE(self).kind = kind;
13100 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020013101 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013102 _PyUnicode_STATE(self).ready = 1;
13103 _PyUnicode_WSTR(self) = NULL;
13104 _PyUnicode_UTF8_LENGTH(self) = 0;
13105 _PyUnicode_UTF8(self) = NULL;
13106 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020013107 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013108
13109 share_utf8 = 0;
13110 share_wstr = 0;
13111 if (kind == PyUnicode_1BYTE_KIND) {
13112 char_size = 1;
13113 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13114 share_utf8 = 1;
13115 }
13116 else if (kind == PyUnicode_2BYTE_KIND) {
13117 char_size = 2;
13118 if (sizeof(wchar_t) == 2)
13119 share_wstr = 1;
13120 }
13121 else {
13122 assert(kind == PyUnicode_4BYTE_KIND);
13123 char_size = 4;
13124 if (sizeof(wchar_t) == 4)
13125 share_wstr = 1;
13126 }
13127
13128 /* Ensure we won't overflow the length. */
13129 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13130 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013131 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013132 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013133 data = PyObject_MALLOC((length + 1) * char_size);
13134 if (data == NULL) {
13135 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013136 goto onError;
13137 }
13138
Victor Stinnerc3c74152011-10-02 20:39:55 +020013139 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013140 if (share_utf8) {
13141 _PyUnicode_UTF8_LENGTH(self) = length;
13142 _PyUnicode_UTF8(self) = data;
13143 }
13144 if (share_wstr) {
13145 _PyUnicode_WSTR_LENGTH(self) = length;
13146 _PyUnicode_WSTR(self) = (wchar_t *)data;
13147 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013148
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013149 Py_MEMCPY(data, PyUnicode_DATA(unicode),
13150 PyUnicode_KIND_SIZE(kind, length + 1));
13151 Py_DECREF(unicode);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013152 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013153 return (PyObject *)self;
13154
13155onError:
13156 Py_DECREF(unicode);
13157 Py_DECREF(self);
13158 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000013159}
13160
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013161PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000013162 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000013163\n\
Collin Winterd474ce82007-08-07 19:42:11 +000013164Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000013165encoding defaults to the current default string encoding.\n\
13166errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000013167
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013168static PyObject *unicode_iter(PyObject *seq);
13169
Guido van Rossumd57fd912000-03-10 22:53:23 +000013170PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000013171 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013172 "str", /* tp_name */
13173 sizeof(PyUnicodeObject), /* tp_size */
13174 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013175 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013176 (destructor)unicode_dealloc, /* tp_dealloc */
13177 0, /* tp_print */
13178 0, /* tp_getattr */
13179 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013180 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013181 unicode_repr, /* tp_repr */
13182 &unicode_as_number, /* tp_as_number */
13183 &unicode_as_sequence, /* tp_as_sequence */
13184 &unicode_as_mapping, /* tp_as_mapping */
13185 (hashfunc) unicode_hash, /* tp_hash*/
13186 0, /* tp_call*/
13187 (reprfunc) unicode_str, /* tp_str */
13188 PyObject_GenericGetAttr, /* tp_getattro */
13189 0, /* tp_setattro */
13190 0, /* tp_as_buffer */
13191 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000013192 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013193 unicode_doc, /* tp_doc */
13194 0, /* tp_traverse */
13195 0, /* tp_clear */
13196 PyUnicode_RichCompare, /* tp_richcompare */
13197 0, /* tp_weaklistoffset */
13198 unicode_iter, /* tp_iter */
13199 0, /* tp_iternext */
13200 unicode_methods, /* tp_methods */
13201 0, /* tp_members */
13202 0, /* tp_getset */
13203 &PyBaseObject_Type, /* tp_base */
13204 0, /* tp_dict */
13205 0, /* tp_descr_get */
13206 0, /* tp_descr_set */
13207 0, /* tp_dictoffset */
13208 0, /* tp_init */
13209 0, /* tp_alloc */
13210 unicode_new, /* tp_new */
13211 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013212};
13213
13214/* Initialize the Unicode implementation */
13215
Thomas Wouters78890102000-07-22 19:25:51 +000013216void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013217{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013218 int i;
13219
Thomas Wouters477c8d52006-05-27 19:21:47 +000013220 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013221 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000013222 0x000A, /* LINE FEED */
13223 0x000D, /* CARRIAGE RETURN */
13224 0x001C, /* FILE SEPARATOR */
13225 0x001D, /* GROUP SEPARATOR */
13226 0x001E, /* RECORD SEPARATOR */
13227 0x0085, /* NEXT LINE */
13228 0x2028, /* LINE SEPARATOR */
13229 0x2029, /* PARAGRAPH SEPARATOR */
13230 };
13231
Fred Drakee4315f52000-05-09 19:53:39 +000013232 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020013233 unicode_empty = PyUnicode_New(0, 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013234 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013235 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013236 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013237
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013238 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000013239 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000013240 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013241 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013242
13243 /* initialize the linebreak bloom filter */
13244 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013245 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020013246 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013247
13248 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013249}
13250
13251/* Finalize the Unicode implementation */
13252
/* Releases nothing and always reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
13258
Guido van Rossumd57fd912000-03-10 22:53:23 +000013259void
Thomas Wouters78890102000-07-22 19:25:51 +000013260_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013261{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013262 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013263
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000013264 Py_XDECREF(unicode_empty);
13265 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000013266
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013267 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013268 if (unicode_latin1[i]) {
13269 Py_DECREF(unicode_latin1[i]);
13270 unicode_latin1[i] = NULL;
13271 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013272 }
Christian Heimesa156e092008-02-16 07:38:31 +000013273 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000013274}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000013275
Walter Dörwald16807132007-05-25 13:52:07 +000013276void
13277PyUnicode_InternInPlace(PyObject **p)
13278{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013279 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
13280 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020013281#ifdef Py_DEBUG
13282 assert(s != NULL);
13283 assert(_PyUnicode_CHECK(s));
13284#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000013285 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020013286 return;
13287#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000013288 /* If it's a subclass, we don't really know what putting
13289 it in the interned dict might do. */
13290 if (!PyUnicode_CheckExact(s))
13291 return;
13292 if (PyUnicode_CHECK_INTERNED(s))
13293 return;
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013294 if (_PyUnicode_READY_REPLACE(p)) {
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013295 assert(0 && "_PyUnicode_READY_REPLACE fail in PyUnicode_InternInPlace");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013296 return;
13297 }
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013298 s = (PyUnicodeObject *)(*p);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013299 if (interned == NULL) {
13300 interned = PyDict_New();
13301 if (interned == NULL) {
13302 PyErr_Clear(); /* Don't leave an exception */
13303 return;
13304 }
13305 }
13306 /* It might be that the GetItem call fails even
13307 though the key is present in the dictionary,
13308 namely when this happens during a stack overflow. */
13309 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000013310 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013311 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000013312
Benjamin Peterson29060642009-01-31 22:14:21 +000013313 if (t) {
13314 Py_INCREF(t);
13315 Py_DECREF(*p);
13316 *p = t;
13317 return;
13318 }
Walter Dörwald16807132007-05-25 13:52:07 +000013319
Benjamin Peterson14339b62009-01-31 16:36:08 +000013320 PyThreadState_GET()->recursion_critical = 1;
13321 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
13322 PyErr_Clear();
13323 PyThreadState_GET()->recursion_critical = 0;
13324 return;
13325 }
13326 PyThreadState_GET()->recursion_critical = 0;
13327 /* The two references in interned are not counted by refcnt.
13328 The deallocator will take care of this */
13329 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013330 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000013331}
13332
13333void
13334PyUnicode_InternImmortal(PyObject **p)
13335{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013336 PyUnicodeObject *u = (PyUnicodeObject *)*p;
13337
Benjamin Peterson14339b62009-01-31 16:36:08 +000013338 PyUnicode_InternInPlace(p);
13339 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013340 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013341 Py_INCREF(*p);
13342 }
Walter Dörwald16807132007-05-25 13:52:07 +000013343}
13344
13345PyObject *
13346PyUnicode_InternFromString(const char *cp)
13347{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013348 PyObject *s = PyUnicode_FromString(cp);
13349 if (s == NULL)
13350 return NULL;
13351 PyUnicode_InternInPlace(&s);
13352 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000013353}
13354
Alexander Belopolsky40018472011-02-26 01:02:56 +000013355void
13356_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000013357{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013358 PyObject *keys;
13359 PyUnicodeObject *s;
13360 Py_ssize_t i, n;
13361 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000013362
Benjamin Peterson14339b62009-01-31 16:36:08 +000013363 if (interned == NULL || !PyDict_Check(interned))
13364 return;
13365 keys = PyDict_Keys(interned);
13366 if (keys == NULL || !PyList_Check(keys)) {
13367 PyErr_Clear();
13368 return;
13369 }
Walter Dörwald16807132007-05-25 13:52:07 +000013370
Benjamin Peterson14339b62009-01-31 16:36:08 +000013371 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
13372 detector, interned unicode strings are not forcibly deallocated;
13373 rather, we give them their stolen references back, and then clear
13374 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000013375
Benjamin Peterson14339b62009-01-31 16:36:08 +000013376 n = PyList_GET_SIZE(keys);
13377 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000013378 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013379 for (i = 0; i < n; i++) {
13380 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013381 if (PyUnicode_READY(s) == -1) {
13382 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013383 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013384 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013385 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013386 case SSTATE_NOT_INTERNED:
13387 /* XXX Shouldn't happen */
13388 break;
13389 case SSTATE_INTERNED_IMMORTAL:
13390 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013391 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013392 break;
13393 case SSTATE_INTERNED_MORTAL:
13394 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013395 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013396 break;
13397 default:
13398 Py_FatalError("Inconsistent interned string state.");
13399 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013400 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013401 }
13402 fprintf(stderr, "total size of all interned strings: "
13403 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
13404 "mortal/immortal\n", mortal_size, immortal_size);
13405 Py_DECREF(keys);
13406 PyDict_Clear(interned);
13407 Py_DECREF(interned);
13408 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000013409}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013410
13411
13412/********************* Unicode Iterator **************************/
13413
13414typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013415 PyObject_HEAD
13416 Py_ssize_t it_index;
13417 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013418} unicodeiterobject;
13419
13420static void
13421unicodeiter_dealloc(unicodeiterobject *it)
13422{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013423 _PyObject_GC_UNTRACK(it);
13424 Py_XDECREF(it->it_seq);
13425 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013426}
13427
13428static int
13429unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
13430{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013431 Py_VISIT(it->it_seq);
13432 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013433}
13434
13435static PyObject *
13436unicodeiter_next(unicodeiterobject *it)
13437{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013438 PyUnicodeObject *seq;
13439 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013440
Benjamin Peterson14339b62009-01-31 16:36:08 +000013441 assert(it != NULL);
13442 seq = it->it_seq;
13443 if (seq == NULL)
13444 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013445 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013446
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013447 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
13448 int kind = PyUnicode_KIND(seq);
13449 void *data = PyUnicode_DATA(seq);
13450 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
13451 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013452 if (item != NULL)
13453 ++it->it_index;
13454 return item;
13455 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013456
Benjamin Peterson14339b62009-01-31 16:36:08 +000013457 Py_DECREF(seq);
13458 it->it_seq = NULL;
13459 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013460}
13461
13462static PyObject *
13463unicodeiter_len(unicodeiterobject *it)
13464{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013465 Py_ssize_t len = 0;
13466 if (it->it_seq)
13467 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
13468 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013469}
13470
13471PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
13472
13473static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013474 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000013475 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000013476 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013477};
13478
13479PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013480 PyVarObject_HEAD_INIT(&PyType_Type, 0)
13481 "str_iterator", /* tp_name */
13482 sizeof(unicodeiterobject), /* tp_basicsize */
13483 0, /* tp_itemsize */
13484 /* methods */
13485 (destructor)unicodeiter_dealloc, /* tp_dealloc */
13486 0, /* tp_print */
13487 0, /* tp_getattr */
13488 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013489 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013490 0, /* tp_repr */
13491 0, /* tp_as_number */
13492 0, /* tp_as_sequence */
13493 0, /* tp_as_mapping */
13494 0, /* tp_hash */
13495 0, /* tp_call */
13496 0, /* tp_str */
13497 PyObject_GenericGetAttr, /* tp_getattro */
13498 0, /* tp_setattro */
13499 0, /* tp_as_buffer */
13500 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
13501 0, /* tp_doc */
13502 (traverseproc)unicodeiter_traverse, /* tp_traverse */
13503 0, /* tp_clear */
13504 0, /* tp_richcompare */
13505 0, /* tp_weaklistoffset */
13506 PyObject_SelfIter, /* tp_iter */
13507 (iternextfunc)unicodeiter_next, /* tp_iternext */
13508 unicodeiter_methods, /* tp_methods */
13509 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013510};
13511
13512static PyObject *
13513unicode_iter(PyObject *seq)
13514{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013515 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013516
Benjamin Peterson14339b62009-01-31 16:36:08 +000013517 if (!PyUnicode_Check(seq)) {
13518 PyErr_BadInternalCall();
13519 return NULL;
13520 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013521 if (PyUnicode_READY(seq) == -1)
13522 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013523 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
13524 if (it == NULL)
13525 return NULL;
13526 it->it_index = 0;
13527 Py_INCREF(seq);
13528 it->it_seq = (PyUnicodeObject *)seq;
13529 _PyObject_GC_TRACK(it);
13530 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013531}
13532
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013533#define UNIOP(x) Py_UNICODE_##x
13534#define UNIOP_t Py_UNICODE
13535#include "uniops.h"
13536#undef UNIOP
13537#undef UNIOP_t
13538#define UNIOP(x) Py_UCS4_##x
13539#define UNIOP_t Py_UCS4
13540#include "uniops.h"
13541#undef UNIOP
13542#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000013543
Victor Stinner71133ff2010-09-01 23:43:53 +000013544Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000013545PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000013546{
13547 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
13548 Py_UNICODE *copy;
13549 Py_ssize_t size;
13550
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013551 if (!PyUnicode_Check(unicode)) {
13552 PyErr_BadArgument();
13553 return NULL;
13554 }
Victor Stinner71133ff2010-09-01 23:43:53 +000013555 /* Ensure we won't overflow the size. */
13556 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
13557 PyErr_NoMemory();
13558 return NULL;
13559 }
13560 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
13561 size *= sizeof(Py_UNICODE);
13562 copy = PyMem_Malloc(size);
13563 if (copy == NULL) {
13564 PyErr_NoMemory();
13565 return NULL;
13566 }
13567 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
13568 return copy;
13569}
Martin v. Löwis5b222132007-06-10 09:51:05 +000013570
Georg Brandl66c221e2010-10-14 07:04:07 +000013571/* A _string module, to export formatter_parser and formatter_field_name_split
13572 to the string.Formatter class implemented in Python. */
13573
13574static PyMethodDef _string_methods[] = {
13575 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
13576 METH_O, PyDoc_STR("split the argument as a field name")},
13577 {"formatter_parser", (PyCFunction) formatter_parser,
13578 METH_O, PyDoc_STR("parse the argument as a format string")},
13579 {NULL, NULL}
13580};
13581
13582static struct PyModuleDef _string_module = {
13583 PyModuleDef_HEAD_INIT,
13584 "_string",
13585 PyDoc_STR("string helper module"),
13586 0,
13587 _string_methods,
13588 NULL,
13589 NULL,
13590 NULL,
13591 NULL
13592};
13593
13594PyMODINIT_FUNC
13595PyInit__string(void)
13596{
13597 return PyModule_Create(&_string_module);
13598}
13599
13600
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013601#ifdef __cplusplus
13602}
13603#endif